Diffstat (limited to 'kernel'): 323 files changed, 24960 insertions, 11620 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index b3097bde4e9c..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,7 +1,6 @@ # # Generated files # -config_data.h -config_data.gz +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer index a3bb4cb52539..68646feefb3d 100644 --- a/kernel/Kconfig.freezer +++ b/kernel/Kconfig.freezer @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only config FREEZER def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a846757..38ef6d06888e 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer Interrupt Frequency Configuration # diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 84d882f3e299..e0852dc333ac 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # The ARCH_INLINE foo is necessary because select ignores "depends on" # @@ -229,7 +230,7 @@ config MUTEX_SPIN_ON_OWNER config RWSEM_SPIN_ON_OWNER def_bool y - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config LOCK_SPIN_ON_OWNER def_bool y @@ -242,9 +243,19 @@ config QUEUED_SPINLOCKS def_bool y if ARCH_USE_QUEUED_SPINLOCKS depends on SMP +config BPF_ARCH_SPINLOCK + bool + config ARCH_USE_QUEUED_RWLOCKS bool config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS depends on SMP + +config ARCH_HAS_MMIOWB + bool + +config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0fee5fe6c899..dc0b682ec2d9 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only choice prompt "Preemption Model" diff --git a/kernel/Makefile b/kernel/Makefile index 6aa7543bcdb2..a8d923b5481b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n # Don't self-instrument. 
KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -70,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -116,17 +118,17 @@ obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o KASAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n -$(obj)/configs.o: $(obj)/config_data.h +$(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) -filechk_ikconfiggz = \ - echo "static const char kernel_config_data[] __used = MAGIC_START"; \ - cat $< | scripts/bin2c; \ - echo "MAGIC_END;" +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz -targets += config_data.h -$(obj)/config_data.h: $(obj)/config_data.gz FORCE - $(call filechk,ikconfiggz) +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname) filp_close(file, NULL); return PTR_ERR(internal); } - err = mnt_want_write(internal); + err = __mnt_want_write(internal); if (err) { mntput(internal); kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname) old = xchg(&ns->bacct, &acct->pin); mutex_unlock(&acct->lock); pin_kill(old); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); mntput(mnt); return 0; } diff --git a/kernel/async.c b/kernel/async.c index a893d6170944..4f9c1d614016 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * async.c: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
*/ @@ -119,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) /* 1) run (and print duration) */ if (initcall_debug && system_state < SYSTEM_RUNNING) { - pr_debug("calling %lli_%pF @ %i\n", + pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); calltime = ktime_get(); @@ -128,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) if (initcall_debug && system_state < SYSTEM_RUNNING) { rettime = ktime_get(); delta = ktime_sub(rettime, calltime); - pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", + pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n", (long long)entry->cookie, entry->func, (long long)ktime_to_ns(delta) >> 10); @@ -149,7 +145,25 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) +/** + * async_schedule_node_domain - NUMA specific version of async_schedule_domain + * @func: function to execute asynchronously + * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to + * @domain: the domain + * + * Returns an async_cookie_t that may be used for checkpointing later. + * @domain may be used in the async_synchronize_*_domain() functions to + * wait within a certain synchronization domain rather than globally. + * + * Note: This function may be called from atomic or non-atomic contexts. + * + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. + */ +async_cookie_t async_schedule_node_domain(async_func_t func, void *data, + int node, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -195,43 +209,30 @@ static async_cookie_t __async_schedule(async_func_t func, void *data, struct asy current->flags |= PF_USED_ASYNC; /* schedule for execution */ - queue_work(system_unbound_wq, &entry->work); + queue_work_node(node, system_unbound_wq, &entry->work); return newcookie; } +EXPORT_SYMBOL_GPL(async_schedule_node_domain); /** - * async_schedule - schedule a function for asynchronous execution + * async_schedule_node - NUMA specific version of async_schedule * @func: function to execute asynchronously * @data: data pointer to pass to the function + * @node: NUMA node that we want to schedule this on or close to * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. - */ -async_cookie_t async_schedule(async_func_t func, void *data) -{ - return __async_schedule(func, data, &async_dfl_domain); -} -EXPORT_SYMBOL_GPL(async_schedule); - -/** - * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @func: function to execute asynchronously - * @data: data pointer to pass to the function - * @domain: the domain * - * Returns an async_cookie_t that may be used for checkpointing later. - * @domain may be used in the async_synchronize_*_domain() functions to - * wait within a certain synchronization domain rather than globally. A - * synchronization domain is specified via @domain. Note: This function - * may be called from atomic or non-atomic contexts. + * The node requested will be honored on a best effort basis. If the node + * has no CPUs associated with it then the work is distributed among all + * available CPUs. 
*/ -async_cookie_t async_schedule_domain(async_func_t func, void *data, - struct async_domain *domain) +async_cookie_t async_schedule_node(async_func_t func, void *data, int node) { - return __async_schedule(func, data, domain); + return async_schedule_node_domain(func, data, node, &async_dfl_domain); } -EXPORT_SYMBOL_GPL(async_schedule_domain); +EXPORT_SYMBOL_GPL(async_schedule_node); /** * async_synchronize_full - synchronize all asynchronous function calls diff --git a/kernel/audit.c b/kernel/audit.c index 632d36059556..da8dc0db5bd3 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit.c -- Auditing support * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. * System-call specific features have moved to auditsc.c @@ -5,20 +6,6 @@ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * * Written by Rickard E. (Rik) Faith <faith@redhat.com> * * Goals: 1) Integrate fully with Security Modules. @@ -396,10 +383,10 @@ static int audit_log_config_change(char *function_name, u32 new, u32 old, struct audit_buffer *ab; int rc = 0; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return rc; - audit_log_format(ab, "%s=%u old=%u ", function_name, new, old); + audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old); audit_log_session_info(ab); rc = audit_log_task_context(ab); if (rc) @@ -1053,7 +1040,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) return err; } -static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) +static void audit_log_common_recv_msg(struct audit_context *context, + struct audit_buffer **ab, u16 msg_type) { uid_t uid = from_kuid(&init_user_ns, current_uid()); pid_t pid = task_tgid_nr(current); @@ -1063,7 +1051,7 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) return; } - *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + *ab = audit_log_start(context, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return; audit_log_format(*ab, "pid=%d uid=%u ", pid, uid); @@ -1071,6 +1059,12 @@ static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) audit_log_task_context(*ab); } +static inline void audit_log_user_recv_msg(struct audit_buffer **ab, + u16 msg_type) +{ + audit_log_common_recv_msg(NULL, ab, msg_type); +} + int is_audit_feature_set(int i) { return af.features & AUDIT_FEATURE_TO_MASK(i); @@ -1338,7 +1332,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) break; } - audit_log_common_recv_msg(&ab, msg_type); + audit_log_user_recv_msg(&ab, msg_type); if (msg_type != AUDIT_USER_TTY) audit_log_format(ab, " 
msg='%.*s'", AUDIT_MESSAGE_TEXT_MAX, @@ -1361,8 +1355,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) return -EINVAL; if (audit_enabled == AUDIT_LOCKED) { - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); + audit_log_format(ab, " op=%s audit_enabled=%d res=0", + msg_type == AUDIT_ADD_RULE ? + "add_rule" : "remove_rule", + audit_enabled); audit_log_end(ab); return -EPERM; } @@ -1373,7 +1371,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TRIM: audit_trim_trees(); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=trim res=1"); audit_log_end(ab); break; @@ -1403,8 +1402,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* OK, here comes... */ err = audit_tag_tree(old, new); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); - + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=make_equiv old="); audit_log_untrustedstring(ab, old); audit_log_format(ab, " new="); @@ -1471,7 +1470,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) old.enabled = t & AUDIT_TTY_ENABLE; old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); - audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); + audit_log_common_recv_msg(audit_context(), &ab, + AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" " old-log_passwd=%d new-log_passwd=%d res=%d", old.enabled, s.enabled, old.log_passwd, @@ -2054,153 +2054,6 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) -{ - int i; - - if (cap_isclear(*cap)) { - audit_log_format(ab, " %s=0", prefix); - return; - } - audit_log_format(ab, " %s=", prefix); - CAP_FOR_EACH_U32(i) - audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); -} - -static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) -{ - audit_log_cap(ab, "cap_fp", &name->fcap.permitted); - audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); - audit_log_format(ab, " cap_fe=%d cap_fver=%x", - name->fcap.fE, name->fcap_ver); -} - -static inline int audit_copy_fcaps(struct audit_names *name, - const struct dentry *dentry) -{ - struct cpu_vfs_cap_data caps; - int rc; - - if (!dentry) - return 0; - - rc = get_vfs_caps_from_disk(dentry, &caps); - if (rc) - return rc; - - name->fcap.permitted = caps.permitted; - name->fcap.inheritable = caps.inheritable; - name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); - name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> - VFS_CAP_REVISION_SHIFT; - - return 0; -} - -/* Copy inode data into an audit_names. 
*/ -void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - struct inode *inode) -{ - name->ino = inode->i_ino; - name->dev = inode->i_sb->s_dev; - name->mode = inode->i_mode; - name->uid = inode->i_uid; - name->gid = inode->i_gid; - name->rdev = inode->i_rdev; - security_inode_getsecid(inode, &name->osid); - audit_copy_fcaps(name, dentry); -} - -/** - * audit_log_name - produce AUDIT_PATH record from struct audit_names - * @context: audit_context for the task - * @n: audit_names structure with reportable details - * @path: optional path to report instead of audit_names->name - * @record_num: record number to report when handling a list of names - * @call_panic: optional pointer to int that will be updated if secid fails - */ -void audit_log_name(struct audit_context *context, struct audit_names *n, - const struct path *path, int record_num, int *call_panic) -{ - struct audit_buffer *ab; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); - if (!ab) - return; - - audit_log_format(ab, "item=%d", record_num); - - if (path) - audit_log_d_path(ab, " name=", path); - else if (n->name) { - switch (n->name_len) { - case AUDIT_NAME_FULL: - /* log the full path */ - audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, n->name->name); - break; - case 0: - /* name was specified as a relative path and the - * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); - break; - default: - /* log the name's directory component */ - audit_log_format(ab, " name="); - audit_log_n_untrustedstring(ab, n->name->name, - n->name_len); - } - } else - audit_log_format(ab, " name=(null)"); - - if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu" - " dev=%02x:%02x mode=%#ho" - " ouid=%u ogid=%u rdev=%02x:%02x", - n->ino, - MAJOR(n->dev), - MINOR(n->dev), - n->mode, - from_kuid(&init_user_ns, n->uid), - from_kgid(&init_user_ns, n->gid), - MAJOR(n->rdev), - MINOR(n->rdev)); - if (n->osid != 0) { - char *ctx = NULL; - u32 len; - if (security_secid_to_secctx( - n->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", n->osid); - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx); - security_release_secctx(ctx, len); - } - } - - /* log the audit_names record type */ - switch(n->type) { - case AUDIT_TYPE_NORMAL: - audit_log_format(ab, " nametype=NORMAL"); - break; - case AUDIT_TYPE_PARENT: - audit_log_format(ab, " nametype=PARENT"); - break; - case AUDIT_TYPE_CHILD_DELETE: - audit_log_format(ab, " nametype=DELETE"); - break; - case AUDIT_TYPE_CHILD_CREATE: - audit_log_format(ab, " nametype=CREATE"); - break; - default: - audit_log_format(ab, " nametype=UNKNOWN"); - break; - } - - audit_log_fcaps(ab, n); - audit_log_end(ab); -} - int audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; @@ -2322,6 +2175,118 @@ void audit_log_link_denied(const char *operation) audit_log_end(ab); } +/* global counter which is incremented every time something logs in */ +static atomic_t session_id = ATOMIC_INIT(0); + +static int audit_set_loginuid_perm(kuid_t loginuid) +{ + /* if we are unset, we don't need privs */ + if (!audit_loginuid_set(current)) + return 0; + /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ + if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) + return -EPERM; + /* it is set, you need permission */ + if (!capable(CAP_AUDIT_CONTROL)) + return -EPERM; + /* reject if this is not an unset and we don't allow that */ + if 
(is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) + && uid_valid(loginuid)) + return -EPERM; + return 0; +} + +static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, + unsigned int oldsessionid, + unsigned int sessionid, int rc) +{ + struct audit_buffer *ab; + uid_t uid, oldloginuid, loginuid; + struct tty_struct *tty; + + if (!audit_enabled) + return; + + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + + uid = from_kuid(&init_user_ns, task_uid(current)); + oldloginuid = from_kuid(&init_user_ns, koldloginuid); + loginuid = from_kuid(&init_user_ns, kloginuid), + tty = audit_get_tty(); + + audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", + oldsessionid, sessionid, !rc); + audit_put_tty(tty); + audit_log_end(ab); +} + +/** + * audit_set_loginuid - set current task's loginuid + * @loginuid: loginuid value + * + * Returns 0. + * + * Called (set) from fs/proc/base.c::proc_loginuid_write(). + */ +int audit_set_loginuid(kuid_t loginuid) +{ + unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; + kuid_t oldloginuid; + int rc; + + oldloginuid = audit_get_loginuid(current); + oldsessionid = audit_get_sessionid(current); + + rc = audit_set_loginuid_perm(loginuid); + if (rc) + goto out; + + /* are we setting or clearing? */ + if (uid_valid(loginuid)) { + sessionid = (unsigned int)atomic_inc_return(&session_id); + if (unlikely(sessionid == AUDIT_SID_UNSET)) + sessionid = (unsigned int)atomic_inc_return(&session_id); + } + + current->sessionid = sessionid; + current->loginuid = loginuid; +out: + audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); + return rc; +} + +/** + * audit_signal_info - record signal info for shutting down audit subsystem + * @sig: signal value + * @t: task being signaled + * + * If the audit subsystem is being terminated, record the task (pid) + * and uid that is doing that. + */ +int audit_signal_info(int sig, struct task_struct *t) +{ + kuid_t uid = current_uid(), auid; + + if (auditd_test_task(t) && + (sig == SIGTERM || sig == SIGHUP || + sig == SIGUSR1 || sig == SIGUSR2)) { + audit_sig_pid = task_tgid_nr(current); + auid = audit_get_loginuid(current); + if (uid_valid(auid)) + audit_sig_uid = auid; + else + audit_sig_uid = uid; + security_task_getsecid(current, &audit_sig_sid); + } + + return audit_signal_info_syscall(t); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer diff --git a/kernel/audit.h b/kernel/audit.h index 91421679a168..6fb7160412d4 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -1,22 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit -- definition of audit_context structure and supporting types * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> @@ -69,6 +56,7 @@ struct audit_cap_data { kernel_cap_t effective; /* effective set of process */ }; kernel_cap_t ambient; + kuid_t rootid; }; /* When fs/namei.c:getname() is called, we store the pointer in name and bump @@ -212,15 +200,6 @@ extern bool audit_ever_enabled; extern void audit_log_session_info(struct audit_buffer *ab); -extern void audit_copy_inode(struct audit_names *name, - const struct dentry *dentry, - struct inode *inode); -extern void audit_log_cap(struct audit_buffer *ab, char *prefix, - kernel_cap_t *cap); -extern void audit_log_name(struct audit_context *context, - struct audit_names *n, const struct path *path, - int record_num, int *call_panic); - extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 @@ -239,7 +218,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right); extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); -extern int audit_compare_dname_path(const char *dname, const char *path, int plen); +extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen); extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); @@ -267,25 +246,52 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab, extern struct tty_struct *audit_get_tty(void); extern void audit_put_tty(struct tty_struct *tty); -/* audit watch functions */ +/* audit watch/mark/tree functions */ #ifdef CONFIG_AUDITSYSCALL +extern unsigned int audit_serial(void); +extern int auditsc_get_stamp(struct audit_context *ctx, + struct timespec64 *t, unsigned int *serial); + extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); -extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op); extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, + dev_t dev); -extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); +extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, + char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, + unsigned long ino, dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); -extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); +extern int audit_exe_compare(struct task_struct *tsk, + struct 
audit_fsnotify_mark *mark); + +extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); +extern void audit_put_chunk(struct audit_chunk *chunk); +extern bool audit_tree_match(struct audit_chunk *chunk, + struct audit_tree *tree); +extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); +extern int audit_add_tree_rule(struct audit_krule *rule); +extern int audit_remove_tree_rule(struct audit_krule *rule); +extern void audit_trim_trees(void); +extern int audit_tag_tree(char *old, char *new); +extern const char *audit_tree_path(struct audit_tree *tree); +extern void audit_put_tree(struct audit_tree *tree); +extern void audit_kill_trees(struct audit_context *context); -#else +extern int audit_signal_info_syscall(struct task_struct *t); +extern void audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx); +extern struct list_head *audit_killed_trees(void); +#else /* CONFIG_AUDITSYSCALL */ +#define auditsc_get_stamp(c, t, s) 0 #define audit_put_watch(w) {} #define audit_get_watch(w) {} #define audit_to_watch(k, p, l, o) (-EINVAL) @@ -301,21 +307,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark #define audit_mark_compare(m, i, d) 0 #define audit_exe_compare(t, m) (-EINVAL) #define audit_dupe_exe(n, o) (-EINVAL) -#endif /* CONFIG_AUDITSYSCALL */ -#ifdef CONFIG_AUDITSYSCALL -extern struct audit_chunk *audit_tree_lookup(const struct inode *inode); -extern void audit_put_chunk(struct audit_chunk *chunk); -extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree); -extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op); -extern int audit_add_tree_rule(struct audit_krule *rule); -extern int audit_remove_tree_rule(struct audit_krule *rule); -extern void audit_trim_trees(void); -extern int audit_tag_tree(char *old, char *new); -extern const char *audit_tree_path(struct audit_tree *tree); -extern void audit_put_tree(struct audit_tree *tree); -extern void audit_kill_trees(struct list_head *list); -#else #define audit_remove_tree_rule(rule) BUG() #define audit_add_tree_rule(rule) -EINVAL #define audit_make_tree(rule, str, op) -EINVAL @@ -323,8 +315,15 @@ extern void audit_kill_trees(struct list_head *list); #define audit_put_tree(tree) (void)0 #define audit_tag_tree(old, new) -EINVAL #define audit_tree_path(rule) "" /* never called */ -#define audit_kill_trees(list) BUG() -#endif +#define audit_kill_trees(context) BUG() + +static inline int audit_signal_info_syscall(struct task_struct *t) +{ + return 0; +} + +#define audit_filter_inodes(t, c) AUDIT_DISABLED +#endif /* CONFIG_AUDITSYSCALL */ extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len); @@ -334,14 +333,5 @@ extern u32 audit_sig_sid; extern int audit_filter(int msgtype, unsigned int listtype); -#ifdef CONFIG_AUDITSYSCALL -extern int audit_signal_info(int sig, struct task_struct *t); -extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx); -extern struct list_head *audit_killed_trees(void); -#else -#define audit_signal_info(s,t) AUDIT_DISABLED -#define audit_filter_inodes(t,c) AUDIT_DISABLED -#endif - extern void audit_ctl_lock(void); extern void audit_ctl_unlock(void); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index cf4512a33675..f0d243318452 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_fsnotify.c -- tracking inodes * * Copyright 2003-2009,2014-2015 Red 
Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #include <linux/kernel.h> @@ -127,7 +118,7 @@ static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, c if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_session_info(ab); @@ -164,7 +155,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) static int audit_mark_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index d4af4d97f847..e49c912f862d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -524,13 +524,14 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static void audit_tree_log_remove_rule(struct audit_krule *rule) +static void audit_tree_log_remove_rule(struct audit_context *context, + struct audit_krule *rule) { struct audit_buffer *ab; if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (unlikely(!ab)) return; audit_log_format(ab, "op=remove_rule dir="); @@ -540,7 +541,7 @@ static void audit_tree_log_remove_rule(struct audit_krule *rule) audit_log_end(ab); } -static void kill_rules(struct audit_tree *tree) +static void kill_rules(struct audit_context *context, struct audit_tree *tree) { struct audit_krule *rule, *next; struct audit_entry *entry; @@ -551,7 +552,7 @@ static void kill_rules(struct audit_tree *tree) list_del_init(&rule->rlist); if (rule->tree) { /* not a half-baked one */ - audit_tree_log_remove_rule(rule); + audit_tree_log_remove_rule(context, rule); if (entry->rule.exe) audit_remove_mark(entry->rule.exe); rule->tree = NULL; @@ -633,7 +634,7 @@ static void trim_marked(struct audit_tree *tree) tree->goner = 1; spin_unlock(&hash_lock); mutex_lock(&audit_filter_mutex); - kill_rules(tree); + kill_rules(audit_context(), tree); list_del_init(&tree->list); mutex_unlock(&audit_filter_mutex); prune_one(tree); @@ -973,8 +974,10 @@ static void audit_schedule_prune(void) * ... and that one is done if evict_chunk() decides to delay until the end * of syscall. Runs synchronously. 
*/ -void audit_kill_trees(struct list_head *list) +void audit_kill_trees(struct audit_context *context) { + struct list_head *list = &context->killed_trees; + audit_ctl_lock(); mutex_lock(&audit_filter_mutex); @@ -982,7 +985,7 @@ void audit_kill_trees(struct list_head *list) struct audit_tree *victim; victim = list_entry(list->next, struct audit_tree, list); - kill_rules(victim); + kill_rules(context, victim); list_del_init(&victim->list); mutex_unlock(&audit_filter_mutex); @@ -1017,7 +1020,7 @@ static void evict_chunk(struct audit_chunk *chunk) list_del_init(&owner->same_root); spin_unlock(&hash_lock); if (!postponed) { - kill_rules(owner); + kill_rules(audit_context(), owner); list_move(&owner->list, &prune_list); need_prune = 1; } else { @@ -1037,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie, + const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20ef9ba134b0..1f31c2f1e6fc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* audit_watch.c -- watching inodes * * Copyright 2003-2009 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/file.h> @@ -242,7 +229,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_NOFS, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); @@ -255,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. 
*/ static void audit_update_watch(struct audit_parent *parent, - const char *dname, dev_t dev, + const struct qstr *dname, dev_t dev, unsigned long ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; @@ -482,7 +469,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) static int audit_watch_handle_event(struct fsnotify_group *group, struct inode *to_tell, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie, + const struct qstr *dname, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index bf309f2592c4..b0126e9c0743 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1,22 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* auditfilter.c -- filtering of audit events * * Copyright 2003-2004 Red Hat, Inc. * Copyright 2005 Hewlett-Packard Development Company, L.P. * Copyright 2005 IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op) /* check if an audit field is valid */ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) { - switch(f->type) { + switch (f->type) { case AUDIT_MSGTYPE: if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && entry->rule.listnr != AUDIT_FILTER_USER) @@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) break; } - switch(entry->rule.listnr) { + switch (entry->rule.listnr) { case AUDIT_FILTER_FS: switch(f->type) { case AUDIT_FSTYPE: @@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) } } - switch(f->type) { - default: - return -EINVAL; + /* Check for valid field type and op */ + switch (f->type) { + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + case AUDIT_PERS: /* <uapi/linux/personality.h> */ + case AUDIT_DEVMINOR: + /* all ops are valid */ + break; case AUDIT_UID: case AUDIT_EUID: case AUDIT_SUID: @@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) case AUDIT_FSGID: case AUDIT_OBJ_GID: case AUDIT_PID: - case AUDIT_PERS: case AUDIT_MSGTYPE: case AUDIT_PPID: case AUDIT_DEVMAJOR: - case AUDIT_DEVMINOR: case AUDIT_EXIT: case AUDIT_SUCCESS: case AUDIT_INODE: case AUDIT_SESSIONID: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + case AUDIT_SADDR_FAM: /* bit ops are only useful on syscall args */ if (f->op == Audit_bitmask || f->op == Audit_bittest) return -EINVAL; break; - case AUDIT_ARG0: - case AUDIT_ARG1: - case AUDIT_ARG2: - case AUDIT_ARG3: case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: - case AUDIT_SUBJ_SEN: - case 
AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: - case AUDIT_OBJ_LEV_LOW: - case AUDIT_OBJ_LEV_HIGH: case AUDIT_WATCH: case AUDIT_DIR: case AUDIT_FILTERKEY: - break; case AUDIT_LOGINUID_SET: - if ((f->val != 0) && (f->val != 1)) - return -EINVAL; - /* FALL THROUGH */ case AUDIT_ARCH: case AUDIT_FSTYPE: + case AUDIT_PERM: + case AUDIT_FILETYPE: + case AUDIT_FIELD_COMPARE: + case AUDIT_EXE: + /* only equal and not equal valid ops */ if (f->op != Audit_not_equal && f->op != Audit_equal) return -EINVAL; break; + default: + /* field not recognized */ + return -EINVAL; + } + + /* Check for select valid field values */ + switch (f->type) { + case AUDIT_LOGINUID_SET: + if ((f->val != 0) && (f->val != 1)) + return -EINVAL; + break; case AUDIT_PERM: if (f->val & ~15) return -EINVAL; @@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (f->val > AUDIT_MAX_FIELD_COMPARE) return -EINVAL; break; - case AUDIT_EXE: - if (f->op != Audit_not_equal && f->op != Audit_equal) + case AUDIT_SADDR_FAM: + if (f->val >= AF_MAX) return -EINVAL; break; + default: + break; } + return 0; } @@ -670,7 +674,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->values[i] = AUDIT_UID_UNSET; break; } - /* fallthrough if set */ + /* fall through - if set */ default: data->values[i] = f->val; } @@ -1091,7 +1095,7 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re if (!audit_enabled) return; - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE); if (!ab) return; audit_log_session_info(ab); @@ -1114,22 +1118,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz) int err = 0; struct audit_entry *entry; - entry = audit_data_to_entry(data, datasz); - if (IS_ERR(entry)) - return PTR_ERR(entry); - switch (type) { case AUDIT_ADD_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_add_rule(entry); audit_log_rule_change("add_rule", &entry->rule, !err); break; case AUDIT_DEL_RULE: + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); err = audit_del_rule(entry); audit_log_rule_change("remove_rule", &entry->rule, !err); break; default: - err = -EINVAL; WARN_ON(1); + return -EINVAL; } if (err || type == AUDIT_DEL_RULE) { @@ -1201,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right) case Audit_bittest: return ((left & right) == right); default: - BUG(); return 0; } } @@ -1224,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) case Audit_bitmask: case Audit_bittest: default: - BUG(); return 0; } } @@ -1247,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) case Audit_bitmask: case Audit_bittest: default: - BUG(); return 0; } } @@ -1290,12 +1293,12 @@ int parent_len(const char *path) * @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL * here indicates that we must compute this value. 
*/ -int audit_compare_dname_path(const char *dname, const char *path, int parentlen) +int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen) { int dlen, pathlen; const char *p; - dlen = strlen(dname); + dlen = dname->len; pathlen = strlen(path); if (pathlen < dlen) return 1; @@ -1306,7 +1309,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen) p = path + parentlen; - return strncmp(p, dname, dlen); + return strncmp(p, dname->name, dlen); } int audit_filter(int msgtype, unsigned int listtype) @@ -1315,8 +1318,6 @@ int audit_filter(int msgtype, unsigned int listtype) int ret = 1; /* Audit by default */ rcu_read_lock(); - if (list_empty(&audit_filter_list[listtype])) - goto unlock_and_return; list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) { int i, result = 0; @@ -1355,7 +1356,7 @@ int audit_filter(int msgtype, unsigned int listtype) if (f->lsm_rule) { security_task_getsecid(current, &sid); result = security_audit_rule_match(sid, - f->type, f->op, f->lsm_rule, NULL); + f->type, f->op, f->lsm_rule); } break; case AUDIT_EXE: diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6593a5207fb0..4effe01ebbe2 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name) - result = audit_watch_compare(rule->watch, name->ino, name->dev); + if (name) { + result = audit_watch_compare(rule->watch, + name->ino, + name->dev); + if (f->op == Audit_not_equal) + result = !result; + } break; case AUDIT_DIR: - if (ctx) + if (ctx) { result = match_tree_refs(ctx, rule->tree); + if (f->op == Audit_not_equal) + result = !result; + } break; case AUDIT_LOGINUID: result = audit_uid_comparator(audit_get_loginuid(tsk), @@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_LOGINUID_SET: result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; + case AUDIT_SADDR_FAM: + if (ctx->sockaddr) + result = audit_comparator(ctx->sockaddr->ss_family, + f->op, f->val); + break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: @@ -631,9 +644,8 @@ static int audit_filter_rules(struct task_struct *tsk, need_sid = 0; } result = security_audit_rule_match(sid, f->type, - f->op, - f->lsm_rule, - ctx); + f->op, + f->lsm_rule); } break; case AUDIT_OBJ_USER: @@ -647,13 +659,17 @@ static int audit_filter_rules(struct task_struct *tsk, /* Find files that match */ if (name) { result = security_audit_rule_match( - name->osid, f->type, f->op, - f->lsm_rule, ctx); + name->osid, + f->type, + f->op, + f->lsm_rule); } else if (ctx) { list_for_each_entry(n, &ctx->names_list, list) { - if (security_audit_rule_match(n->osid, f->type, - f->op, f->lsm_rule, - ctx)) { + if (security_audit_rule_match( + n->osid, + f->type, + f->op, + f->lsm_rule)) { ++result; break; } @@ -664,7 +680,7 @@ static int audit_filter_rules(struct task_struct *tsk, break; if (security_audit_rule_match(ctx->ipc.osid, f->type, f->op, - f->lsm_rule, ctx)) + f->lsm_rule)) ++result; } break; @@ -681,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_PERM: result = audit_match_perm(ctx, f->val); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_FILETYPE: result = audit_match_filetype(ctx, f->val); + if (f->op == Audit_not_equal) + result = !result; break; case AUDIT_FIELD_COMPARE: result = audit_field_compare(tsk, cred, f, ctx, name); @@ -768,15 +788,13 @@ static enum audit_state 
audit_filter_syscall(struct task_struct *tsk, return AUDIT_DISABLED; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return state; - } + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, ctx->major) && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state, false)) { + rcu_read_unlock(); + ctx->current_state = state; + return state; } } rcu_read_unlock(); @@ -795,9 +813,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_entry *e; enum audit_state state; - if (list_empty(list)) - return 0; - list_for_each_entry_rcu(e, list, list) { if (audit_in_mask(&e->rule, ctx->major) && audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { @@ -805,7 +820,6 @@ static int audit_filter_inode_name(struct task_struct *tsk, return 1; } } - return 0; } @@ -837,6 +851,13 @@ static inline void audit_proctitle_free(struct audit_context *context) context->proctitle.len = 0; } +static inline void audit_free_module(struct audit_context *context) +{ + if (context->type == AUDIT_KERN_MODULE) { + kfree(context->module.name); + context->module.name = NULL; + } +} static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -920,6 +941,7 @@ int audit_alloc(struct task_struct *tsk) static inline void audit_free_context(struct audit_context *context) { + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); free_tree_refs(context); @@ -1136,6 +1158,33 @@ out: kfree(buf_head); } +static void audit_log_cap(struct audit_buffer *ab, char *prefix, + kernel_cap_t *cap) +{ + int i; + + if (cap_isclear(*cap)) { + audit_log_format(ab, " %s=0", prefix); + return; + } + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) + audit_log_format(ab, "%08x", cap->cap[CAP_LAST_U32 - i]); +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + if (name->fcap_ver == -1) { + audit_log_format(ab, " cap_fe=? cap_fver=? cap_fp=? 
cap_fi=?"); + return; + } + audit_log_cap(ab, "cap_fp", &name->fcap.permitted); + audit_log_cap(ab, "cap_fi", &name->fcap.inheritable); + audit_log_format(ab, " cap_fe=%d cap_fver=%x cap_frootid=%d", + name->fcap.fE, name->fcap_ver, + from_kuid(&init_user_ns, name->fcap.rootid)); +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1237,7 +1286,6 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "name="); if (context->module.name) { audit_log_untrustedstring(ab, context->module.name); - kfree(context->module.name); } else audit_log_format(ab, "(null)"); @@ -1258,6 +1306,97 @@ static inline int audit_proctitle_rtrim(char *proctitle, int len) return len; } +/* + * audit_log_name - produce AUDIT_PATH record from struct audit_names + * @context: audit_context for the task + * @n: audit_names structure with reportable details + * @path: optional path to report instead of audit_names->name + * @record_num: record number to report when handling a list of names + * @call_panic: optional pointer to int that will be updated if secid fails + */ +static void audit_log_name(struct audit_context *context, struct audit_names *n, + const struct path *path, int record_num, int *call_panic) +{ + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + return; + + audit_log_format(ab, "item=%d", record_num); + + if (path) + audit_log_d_path(ab, " name=", path); + else if (n->name) { + switch (n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd + */ + audit_log_d_path(ab, " name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != AUDIT_INO_UNSET) + audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + from_kuid(&init_user_ns, n->uid), + from_kgid(&init_user_ns, n->gid), + MAJOR(n->rdev), + MINOR(n->rdev)); + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + if (call_panic) + *call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + /* log the audit_names record type */ + switch (n->type) { + case AUDIT_TYPE_NORMAL: + audit_log_format(ab, " nametype=NORMAL"); + break; + case AUDIT_TYPE_PARENT: + audit_log_format(ab, " nametype=PARENT"); + break; + case AUDIT_TYPE_CHILD_DELETE: + audit_log_format(ab, " nametype=DELETE"); + break; + case AUDIT_TYPE_CHILD_CREATE: + audit_log_format(ab, " nametype=CREATE"); + break; + default: + audit_log_format(ab, " nametype=UNKNOWN"); + break; + } + + audit_log_fcaps(ab, n); + audit_log_end(ab); +} + static void audit_log_proctitle(void) { int res; @@ -1358,6 +1497,9 @@ static void audit_log_exit(void) audit_log_cap(ab, "pi", &axs->new_pcap.inheritable); audit_log_cap(ab, "pe", &axs->new_pcap.effective); audit_log_cap(ab, "pa", &axs->new_pcap.ambient); + audit_log_format(ab, " frootid=%d", + from_kuid(&init_user_ns, + axs->fcap.rootid)); break; } } @@ -1444,6 +1586,9 @@ void 
__audit_free(struct task_struct *tsk) if (!context) return; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(context); + /* We are called either by do_exit() or the fork() error handling code; * in the former case tsk == current and in the latter tsk is a * random task_struct that doesn't doesn't have any meaningful data we @@ -1460,9 +1605,6 @@ void __audit_free(struct task_struct *tsk) audit_log_exit(); } - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - audit_set_context(tsk, NULL); audit_free_context(context); } @@ -1505,7 +1647,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; } - context->arch = syscall_get_arch(); + context->arch = syscall_get_arch(current); context->major = major; context->argv[0] = a1; context->argv[1] = a2; @@ -1537,6 +1679,9 @@ void __audit_syscall_exit(int success, long return_code) if (!context) return; + if (!list_empty(&context->killed_trees)) + audit_kill_trees(context); + if (!context->dummy && context->in_syscall) { if (success) context->return_valid = AUDITSC_SUCCESS; @@ -1571,9 +1716,7 @@ void __audit_syscall_exit(int success, long return_code) context->in_syscall = 0; context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; - if (!list_empty(&context->killed_trees)) - audit_kill_trees(&context->killed_trees); - + audit_free_module(context); audit_free_names(context); unroll_tree_refs(context, NULL, 0); audit_free_aux(context); @@ -1750,6 +1893,48 @@ void __audit_getname(struct filename *name) get_fs_pwd(current->fs, &context->pwd); } +static inline int audit_copy_fcaps(struct audit_names *name, + const struct dentry *dentry) +{ + struct cpu_vfs_cap_data caps; + int rc; + + if (!dentry) + return 0; + + rc = get_vfs_caps_from_disk(dentry, &caps); + if (rc) + return rc; + + name->fcap.permitted = caps.permitted; + name->fcap.inheritable = caps.inheritable; + name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + name->fcap.rootid = caps.rootid; + name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> + VFS_CAP_REVISION_SHIFT; + + return 0; +} + +/* Copy inode data into an audit_names. 
*/ +static void audit_copy_inode(struct audit_names *name, + const struct dentry *dentry, + struct inode *inode, unsigned int flags) +{ + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + security_inode_getsecid(inode, &name->osid); + if (flags & AUDIT_INODE_NOEVAL) { + name->fcap_ver = -1; + return; + } + audit_copy_fcaps(name, dentry); +} + /** * __audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1763,10 +1948,29 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; + struct audit_entry *e; + struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; + int i; if (!context->in_syscall) return; + rcu_read_lock(); + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(inode->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; + } + } + } + rcu_read_unlock(); + if (!name) goto out_alloc; @@ -1832,7 +2036,7 @@ out: n->type = AUDIT_TYPE_NORMAL; } handle_path(dentry); - audit_copy_inode(n, dentry, inode); + audit_copy_inode(n, dentry, inode, flags & AUDIT_INODE_NOEVAL); } void __audit_file(const struct file *file) @@ -1860,7 +2064,7 @@ void __audit_inode_child(struct inode *parent, { struct audit_context *context = audit_context(); struct inode *inode = d_backing_inode(dentry); - const char *dname = dentry->d_name.name; + const struct qstr *dname = &dentry->d_name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; struct audit_entry *e; struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS]; @@ -1870,20 +2074,16 @@ void __audit_inode_child(struct inode *parent, return; rcu_read_lock(); - if (!list_empty(list)) { - list_for_each_entry_rcu(e, list, list) { - for (i = 0; i < e->rule.field_count; i++) { - struct audit_field *f = &e->rule.fields[i]; - - if (f->type == AUDIT_FSTYPE) { - if (audit_comparator(parent->i_sb->s_magic, - f->op, f->val)) { - if (e->rule.action == AUDIT_NEVER) { - rcu_read_unlock(); - return; - } - } - } + list_for_each_entry_rcu(e, list, list) { + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + + if (f->type == AUDIT_FSTYPE + && audit_comparator(parent->i_sb->s_magic, + f->op, f->val) + && e->rule.action == AUDIT_NEVER) { + rcu_read_unlock(); + return; } } } @@ -1916,7 +2116,7 @@ void __audit_inode_child(struct inode *parent, (n->type != type && n->type != AUDIT_TYPE_UNKNOWN)) continue; - if (!strcmp(dname, n->name->name) || + if (!strcmp(dname->name, n->name->name) || !audit_compare_dname_path(dname, n->name->name, found_parent ? 
found_parent->name_len : @@ -1933,7 +2133,7 @@ void __audit_inode_child(struct inode *parent, n = audit_alloc_name(context, AUDIT_TYPE_PARENT); if (!n) return; - audit_copy_inode(n, NULL, parent); + audit_copy_inode(n, NULL, parent, 0); } if (!found_child) { @@ -1952,7 +2152,7 @@ void __audit_inode_child(struct inode *parent, } if (inode) - audit_copy_inode(found_child, dentry, inode); + audit_copy_inode(found_child, dentry, inode, 0); else found_child->ino = AUDIT_INO_UNSET; } @@ -1983,90 +2183,6 @@ int auditsc_get_stamp(struct audit_context *ctx, return 1; } -/* global counter which is incremented every time something logs in */ -static atomic_t session_id = ATOMIC_INIT(0); - -static int audit_set_loginuid_perm(kuid_t loginuid) -{ - /* if we are unset, we don't need privs */ - if (!audit_loginuid_set(current)) - return 0; - /* if AUDIT_FEATURE_LOGINUID_IMMUTABLE means never ever allow a change*/ - if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE)) - return -EPERM; - /* it is set, you need permission */ - if (!capable(CAP_AUDIT_CONTROL)) - return -EPERM; - /* reject if this is not an unset and we don't allow that */ - if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid)) - return -EPERM; - return 0; -} - -static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, - unsigned int oldsessionid, unsigned int sessionid, - int rc) -{ - struct audit_buffer *ab; - uid_t uid, oldloginuid, loginuid; - struct tty_struct *tty; - - if (!audit_enabled) - return; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; - - uid = from_kuid(&init_user_ns, task_uid(current)); - oldloginuid = from_kuid(&init_user_ns, koldloginuid); - loginuid = from_kuid(&init_user_ns, kloginuid), - tty = audit_get_tty(); - - audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid); - audit_log_task_context(ab); - audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", - oldloginuid, loginuid, tty ? tty_name(tty) : "(none)", - oldsessionid, sessionid, !rc); - audit_put_tty(tty); - audit_log_end(ab); -} - -/** - * audit_set_loginuid - set current task's audit_context loginuid - * @loginuid: loginuid value - * - * Returns 0. - * - * Called (set) from fs/proc/base.c::proc_loginuid_write(). - */ -int audit_set_loginuid(kuid_t loginuid) -{ - unsigned int oldsessionid, sessionid = AUDIT_SID_UNSET; - kuid_t oldloginuid; - int rc; - - oldloginuid = audit_get_loginuid(current); - oldsessionid = audit_get_sessionid(current); - - rc = audit_set_loginuid_perm(loginuid); - if (rc) - goto out; - - /* are we setting or clearing? */ - if (uid_valid(loginuid)) { - sessionid = (unsigned int)atomic_inc_return(&session_id); - if (unlikely(sessionid == AUDIT_SID_UNSET)) - sessionid = (unsigned int)atomic_inc_return(&session_id); - } - - current->sessionid = sessionid; - current->loginuid = loginuid; -out: - audit_log_set_loginuid(oldloginuid, loginuid, oldsessionid, sessionid, rc); - return rc; -} - /** * __audit_mq_open - record audit data for a POSIX MQ open * @oflag: open flag @@ -2261,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t) } /** - * audit_signal_info - record signal info for shutting down audit subsystem - * @sig: signal value + * audit_signal_info_syscall - record signal info for syscalls * @t: task being signaled * * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. 
*/ -int audit_signal_info(int sig, struct task_struct *t) +int audit_signal_info_syscall(struct task_struct *t) { struct audit_aux_data_pids *axp; struct audit_context *ctx = audit_context(); - kuid_t uid = current_uid(), auid, t_uid = task_uid(t); - - if (auditd_test_task(t) && - (sig == SIGTERM || sig == SIGHUP || - sig == SIGUSR1 || sig == SIGUSR2)) { - audit_sig_pid = task_tgid_nr(current); - auid = audit_get_loginuid(current); - if (uid_valid(auid)) - audit_sig_uid = auid; - else - audit_sig_uid = uid; - security_task_getsecid(current, &audit_sig_sid); - } + kuid_t t_uid = task_uid(t); if (!audit_signals || audit_dummy_context()) return 0; @@ -2355,6 +2458,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm, ax->fcap.permitted = vcaps.permitted; ax->fcap.inheritable = vcaps.inheritable; ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap.rootid = vcaps.rootid; ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; ax->old_pcap.permitted = old->cap_permitted; @@ -2412,6 +2516,35 @@ void __audit_fanotify(unsigned int response) AUDIT_FANOTIFY, "resp=%u", response); } +void __audit_tk_injoffset(struct timespec64 offset) +{ + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, + "sec=%lli nsec=%li", + (long long)offset.tv_sec, offset.tv_nsec); +} + +static void audit_log_ntp_val(const struct audit_ntp_data *ad, + const char *op, enum audit_ntp_type type) +{ + const struct audit_ntp_val *val = &ad->vals[type]; + + if (val->newval == val->oldval) + return; + + audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, + "op=%s old=%lli new=%lli", op, val->oldval, val->newval); +} + +void __audit_ntp_log(const struct audit_ntp_data *ad) +{ + audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); + audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); + audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); + audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); + audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); + audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); +} + static void audit_log_task(struct audit_buffer *ab) { kuid_t auid, uid; @@ -2480,7 +2613,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, + signr, syscall_get_arch(current), syscall, in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 1323360d90e3..a2a97fa3071b 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple stack backtrace regression test module * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
*/ #include <linux/completion.h> @@ -48,19 +44,14 @@ static void backtrace_test_irq(void) #ifdef CONFIG_STACKTRACE static void backtrace_test_saved(void) { - struct stack_trace trace; unsigned long entries[8]; + unsigned int nr_entries; pr_info("Testing a saved backtrace.\n"); pr_info("The following trace is a kernel self test and not a bug!\n"); - trace.nr_entries = 0; - trace.max_entries = ARRAY_SIZE(entries); - trace.entries = entries; - trace.skip = 0; - - save_stack_trace(&trace); - print_stack_trace(&trace, 0); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + stack_trace_print(entries, nr_entries, 0); } #else static void backtrace_test_saved(void) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 4c2fa3ac56f6..29d781061cd5 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o +CFLAGS_core.o += $(call cc-disable-warning, override-init) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 25632a75d630..1c65ce0098a9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016,2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include <linux/bpf.h> #include <linux/btf.h> @@ -22,7 +14,7 @@ #include "map_in_map.h" #define ARRAY_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) static void bpf_array_free_percpu(struct bpf_array *array) { @@ -63,6 +55,7 @@ int array_map_alloc_check(union bpf_attr *attr) if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || (percpu && numa_node != NUMA_NO_NODE)) return -EINVAL; @@ -82,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); u64 cost, array_size, mask64; + struct bpf_map_memory mem; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -115,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* make sure there is no u32 overflow later in round_up() */ cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - if (percpu) { + if (percpu) cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - } - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } array->index_mask = index_mask; array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { + bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } @@ -160,6 +151,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * (index & array->index_mask); } +static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, + u32 off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + if (map->max_entries != 1) + return -ENOTSUPP; + if (off >= map->value_size) + return -EINVAL; + + *imm = (unsigned long)array->value; + return 0; +} + +static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, + u32 *off) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u64 base = (unsigned long)array->value; + u64 range = array->elem_size; + + if (map->max_entries != 1) + return -ENOTSUPP; + if (imm < base || imm >= base + range) + return -ENOENT; + + *off = imm - base; + return 0; +} + /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { @@ -253,8 +274,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; + char *val; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -262,17 +284,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if 
(unlikely(map_flags == BPF_NOEXIST)) + if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + if (unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) + return -EINVAL; + + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); - else - memcpy(array->value + - array->elem_size * (index & array->index_mask), - value, map->value_size); + } else { + val = array->value + + array->elem_size * (index & array->index_mask); + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, val, value, false); + else + copy_map_value(map, val, value); + } return 0; } @@ -351,7 +381,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key, return; } - seq_printf(m, "%u: ", *(u32 *)key); + if (map->btf_key_type_id) + seq_printf(m, "%u: ", *(u32 *)key); btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); seq_puts(m, "\n"); @@ -388,6 +419,18 @@ static int array_map_check_btf(const struct bpf_map *map, { u32 int_data; + /* One exception for keyless BTF: .bss/.data/.rodata map */ + if (btf_type_is_void(key_type)) { + if (map->map_type != BPF_MAP_TYPE_ARRAY || + map->max_entries != 1) + return -EINVAL; + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) + return -EINVAL; + + return 0; + } + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) return -EINVAL; @@ -410,6 +453,8 @@ const struct bpf_map_ops array_map_ops = { .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, .map_gen_lookup = array_map_gen_lookup, + .map_direct_value_addr = array_map_direct_value_addr, + .map_direct_value_meta = array_map_direct_value_meta, .map_seq_show_elem = array_map_seq_show_elem, .map_check_btf = array_map_check_btf, }; @@ -431,6 +476,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr) /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) return -EINVAL; + /* Program read-only/write-only not supported for special maps yet. */ + if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) + return -EINVAL; return array_map_alloc_check(attr); } diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index e6ef4401a138..1b6b9349cb85 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/cpumask.h> #include <linux/spinlock.h> diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 7d4f89b7cb84..f02504640e18 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
*/ #ifndef __BPF_LRU_LIST_H_ #define __BPF_LRU_LIST_H_ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c57bd10340ed..546ebee39e2a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -157,7 +157,7 @@ * */ -#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE) +#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) #define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) #define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) #define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) @@ -185,6 +185,16 @@ i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, struct_type, member) \ + for (i = 0, member = btf_type_var_secinfo(struct_type); \ + i < btf_type_vlen(struct_type); \ + i++, member++) + +#define for_each_vsi_from(i, from, struct_type, member) \ + for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ + i < btf_type_vlen(struct_type); \ + i++, member++) + static DEFINE_IDR(btf_idr); static DEFINE_SPINLOCK(btf_idr_lock); @@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = "RESTRICT", [BTF_KIND_FUNC] = "FUNC", [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", + [BTF_KIND_VAR] = "VAR", + [BTF_KIND_DATASEC] = "DATASEC", }; struct btf_kind_operations { @@ -314,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) return false; } -static bool btf_type_is_void(const struct btf_type *t) +bool btf_type_is_void(const struct btf_type *t) { return t == &btf_void; } @@ -355,6 +367,11 @@ static bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + static bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; @@ -370,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_INT; } +static bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +static bool btf_type_is_datasec(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +} + +/* Types that act only as a source, not sink or intermediate + * type when resolving. + */ +static bool btf_type_is_resolve_source_only(const struct btf_type *t) +{ + return btf_type_is_var(t) || + btf_type_is_datasec(t); +} + /* What types need to be resolved? * * btf_type_is_modifier() is an obvious one. * * btf_type_is_struct() because its member refers to * another type (through member->type). - + * + * btf_type_is_var() because the variable refers to + * another type. btf_type_is_datasec() holds multiple + * btf_type_is_var() types that need resolving. + * * btf_type_is_array() because its element (array->type) * refers to another type. 
Array can be thought of a * special case of struct while array just has the same @@ -385,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t) static bool btf_type_needs_resolve(const struct btf_type *t) { return btf_type_is_modifier(t) || - btf_type_is_ptr(t) || - btf_type_is_struct(t) || - btf_type_is_array(t); + btf_type_is_ptr(t) || + btf_type_is_struct(t) || + btf_type_is_array(t) || + btf_type_is_var(t) || + btf_type_is_datasec(t); } /* t->size can be used */ @@ -398,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_DATASEC: return true; } @@ -462,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t) return (const struct btf_enum *)(t + 1); } +static const struct btf_var *btf_type_var(const struct btf_type *t) +{ + return (const struct btf_var *)(t + 1); +} + +static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) { return kind_ops[BTF_INFO_KIND(t->info)]; @@ -473,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset) offset < btf->hdr.str_len; } -/* Only C-style identifier is permitted. This can be relaxed if - * necessary. - */ -static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +{ + if ((first ? !isalpha(c) : + !isalnum(c)) && + c != '_' && + ((c == '.' && !dot_ok) || + c != '.')) + return false; + return true; +} + +static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ const char *src = &btf->strings[offset]; const char *src_limit; - if (!isalpha(*src) && *src != '_') + if (!__btf_name_char_ok(*src, true, dot_ok)) return false; /* set a limit on identifier length */ src_limit = src + KSYM_NAME_LEN; src++; while (*src && src < src_limit) { - if (!isalnum(*src) && *src != '_') + if (!__btf_name_char_ok(*src, false, dot_ok)) return false; src++; } @@ -497,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) return !*src; } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, false); +} + +static bool btf_name_valid_section(const struct btf *btf, u32 offset) +{ + return __btf_name_valid(btf, offset, true); +} + static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { if (!offset) @@ -525,7 +599,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) /* * Regular int is not a bit field and it must be either - * u8/u16/u32/u64. + * u8/u16/u32/u64 or __int128. 
*/ static bool btf_type_int_is_regular(const struct btf_type *t) { @@ -538,7 +612,8 @@ static bool btf_type_int_is_regular(const struct btf_type *t) if (BITS_PER_BYTE_MASKED(nr_bits) || BTF_INT_OFFSET(int_data) || (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) { + nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && + nr_bytes != (2 * sizeof(u64)))) { return false; } @@ -691,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env, __btf_verifier_log(log, "\n"); } +__printf(4, 5) +static void btf_verifier_log_vsi(struct btf_verifier_env *env, + const struct btf_type *datasec_type, + const struct btf_var_secinfo *vsi, + const char *fmt, ...) +{ + struct bpf_verifier_log *log = &env->log; + va_list args; + + if (!bpf_verifier_log_needed(log)) + return; + if (env->phase != CHECK_META) + btf_verifier_log_type(env, datasec_type, NULL); + + __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", + vsi->type, vsi->offset, vsi->size); + if (fmt && *fmt) { + __btf_verifier_log(log, " "); + va_start(args, fmt); + bpf_verifier_vlog(log, fmt, args); + va_end(args); + } + + __btf_verifier_log(log, "\n"); +} + static void btf_verifier_log_hdr(struct btf_verifier_env *env, u32 btf_data_size) { @@ -968,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { - if (WARN_ON_ONCE(!btf_type_is_modifier(size_type))) + if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && + !btf_type_is_var(size_type))) return NULL; size = btf->resolved_sizes[size_type_id]; @@ -1063,9 +1165,9 @@ static int btf_int_check_member(struct btf_verifier_env *env, nr_copy_bits = BTF_INT_BITS(int_data) + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -1119,9 +1221,9 @@ static int btf_int_check_kflag_member(struct btf_verifier_env *env, bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); - if (nr_copy_bits > BITS_PER_U64) { + if (nr_copy_bits > BITS_PER_U128) { btf_verifier_log_member(env, struct_type, member, - "nr_copy_bits exceeds 64"); + "nr_copy_bits exceeds 128"); return -EINVAL; } @@ -1168,9 +1270,9 @@ static s32 btf_int_check_meta(struct btf_verifier_env *env, nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); - if (nr_bits > BITS_PER_U64) { + if (nr_bits > BITS_PER_U128) { btf_verifier_log_type(env, t, "nr_bits exceeds %zu", - BITS_PER_U64); + BITS_PER_U128); return -EINVAL; } @@ -1211,31 +1313,93 @@ static void btf_int_log(struct btf_verifier_env *env, btf_int_encoding_str(BTF_INT_ENCODING(int_data))); } +static void btf_int128_print(struct seq_file *m, void *data) +{ + /* data points to a __int128 number. 
+ * Suppose + * int128_num = *(__int128 *)data; + * The below formulas shows what upper_num and lower_num represents: + * upper_num = int128_num >> 64; + * lower_num = int128_num & 0xffffffffFFFFFFFFULL; + */ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = *(u64 *)data; + lower_num = *(u64 *)(data + 8); +#else + upper_num = *(u64 *)(data + 8); + lower_num = *(u64 *)data; +#endif + if (upper_num == 0) + seq_printf(m, "0x%llx", lower_num); + else + seq_printf(m, "0x%llx%016llx", upper_num, lower_num); +} + +static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, + u16 right_shift_bits) +{ + u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = print_num[0]; + lower_num = print_num[1]; +#else + upper_num = print_num[1]; + lower_num = print_num[0]; +#endif + + /* shake out un-needed bits by shift/or operations */ + if (left_shift_bits >= 64) { + upper_num = lower_num << (left_shift_bits - 64); + lower_num = 0; + } else { + upper_num = (upper_num << left_shift_bits) | + (lower_num >> (64 - left_shift_bits)); + lower_num = lower_num << left_shift_bits; + } + + if (right_shift_bits >= 64) { + lower_num = upper_num >> (right_shift_bits - 64); + upper_num = 0; + } else { + lower_num = (lower_num >> right_shift_bits) | + (upper_num << (64 - right_shift_bits)); + upper_num = upper_num >> right_shift_bits; + } + +#ifdef __BIG_ENDIAN_BITFIELD + print_num[0] = upper_num; + print_num[1] = lower_num; +#else + print_num[0] = lower_num; + print_num[1] = upper_num; +#endif +} + static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_bits, struct seq_file *m) { u16 left_shift_bits, right_shift_bits; u8 nr_copy_bytes; u8 nr_copy_bits; - u64 print_num; + u64 print_num[2] = {}; nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); - print_num = 0; - memcpy(&print_num, data, nr_copy_bytes); + memcpy(print_num, data, nr_copy_bytes); #ifdef __BIG_ENDIAN_BITFIELD left_shift_bits = bits_offset; #else - left_shift_bits = BITS_PER_U64 - nr_copy_bits; + left_shift_bits = BITS_PER_U128 - nr_copy_bits; #endif - right_shift_bits = BITS_PER_U64 - nr_bits; - - print_num <<= left_shift_bits; - print_num >>= right_shift_bits; + right_shift_bits = BITS_PER_U128 - nr_bits; - seq_printf(m, "0x%llx", print_num); + btf_int128_shift(print_num, left_shift_bits, right_shift_bits); + btf_int128_print(m, print_num); } @@ -1250,7 +1414,7 @@ static void btf_int_bits_seq_show(const struct btf *btf, /* * bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. + * BTF_INT_OFFSET() cannot exceed 128 bits. 
*/ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -1274,6 +1438,9 @@ static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, } switch (nr_bits) { + case 128: + btf_int128_print(m, data); + break; case 64: if (sign) seq_printf(m, "%lld", *(s64 *)data); @@ -1438,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, u32 next_type_size = 0; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1471,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env, return 0; } +static int btf_var_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *next_type; + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + struct btf *btf = env->btf; + u32 next_type_size; + + next_type = btf_type_by_id(btf, next_type_id); + if (!next_type || btf_type_is_resolve_source_only(next_type)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, next_type) && + !env_type_is_resolved(env, next_type_id)) + return env_stack_push(env, next_type, next_type_id); + + if (btf_type_is_modifier(next_type)) { + const struct btf_type *resolved_type; + u32 resolved_type_id; + + resolved_type_id = next_type_id; + resolved_type = btf_type_id_resolve(btf, &resolved_type_id); + + if (btf_type_is_ptr(resolved_type) && + !env_type_is_resolve_sink(env, resolved_type) && + !env_type_is_resolved(env, resolved_type_id)) + return env_stack_push(env, resolved_type, + resolved_type_id); + } + + /* We must resolve to something concrete at this point, no + * forward types or similar that would resolve to size of + * zero is allowed. 
+ */ + if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) { + btf_verifier_log_type(env, v->t, "Invalid type_id"); + return -EINVAL; + } + + env_stack_pop_resolved(env, next_type_id, next_type_size); + + return 0; +} + static int btf_ptr_resolve(struct btf_verifier_env *env, const struct resolve_vertex *v) { @@ -1480,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, struct btf *btf = env->btf; next_type = btf_type_by_id(btf, next_type_id); - if (!next_type) { + if (!next_type || btf_type_is_resolve_source_only(next_type)) { btf_verifier_log_type(env, v->t, "Invalid type_id"); return -EINVAL; } @@ -1538,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf, btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); } +static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, + u32 type_id, void *data, u8 bits_offset, + struct seq_file *m) +{ + t = btf_type_id_resolve(btf, &type_id); + + btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +} + static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -1705,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->index_type */ index_type_id = array->index_type; index_type = btf_type_by_id(btf, index_type_id); - if (btf_type_nosize_or_null(index_type)) { + if (btf_type_nosize_or_null(index_type) || + btf_type_is_resolve_source_only(index_type)) { btf_verifier_log_type(env, v->t, "Invalid index"); return -EINVAL; } @@ -1724,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, /* Check array->type */ elem_type_id = array->type; elem_type = btf_type_by_id(btf, elem_type_id); - if (btf_type_nosize_or_null(elem_type)) { + if (btf_type_nosize_or_null(elem_type) || + btf_type_is_resolve_source_only(elem_type)) { btf_verifier_log_type(env, v->t, "Invalid elem"); return -EINVAL; @@ -1945,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, const struct btf_type *member_type = btf_type_by_id(env->btf, member_type_id); - if (btf_type_nosize_or_null(member_type)) { + if (btf_type_nosize_or_null(member_type) || + btf_type_is_resolve_source_only(member_type)) { btf_verifier_log_member(env, v->t, member, "Invalid member"); return -EINVAL; @@ -1980,6 +2206,43 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +/* find 'struct bpf_spin_lock' in map value. 
+ * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + if (!__btf_type_is_struct(t)) + return -EINVAL; + + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + if (!__btf_type_is_struct(member_type)) + continue; + if (member_type->size != sizeof(struct bpf_spin_lock)) + continue; + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), + "bpf_spin_lock")) + continue; + if (off != -ENOENT) + /* only one 'struct bpf_spin_lock' is allowed */ + return -E2BIG; + off = btf_member_bit_offset(t, member); + if (off % 8) + /* valid C code cannot generate such BTF */ + return -EINVAL; + off /= 8; + if (off % __alignof__(struct bpf_spin_lock)) + /* valid struct bpf_spin_lock will be 4 byte aligned */ + return -EINVAL; + } + return off; +} + static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) @@ -2303,6 +2566,222 @@ static struct btf_kind_operations func_ops = { .seq_show = btf_df_seq_show, }; +static s32 btf_var_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var *var; + u32 meta_needed = sizeof(*var); + + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen != 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + !__btf_name_valid(env->btf, t->name_off, true)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + /* A var cannot be in type void */ + if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { + btf_verifier_log_type(env, t, "Invalid type_id"); + return -EINVAL; + } + + var = btf_type_var(t); + if (var->linkage != BTF_VAR_STATIC && + var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { + btf_verifier_log_type(env, t, "Linkage not supported"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + return meta_needed; +} + +static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +{ + const struct btf_var *var = btf_type_var(t); + + btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +} + +static const struct btf_kind_operations var_ops = { + .check_meta = btf_var_check_meta, + .resolve = btf_var_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_var_log, + .seq_show = btf_var_seq_show, +}; + +static s32 btf_datasec_check_meta(struct btf_verifier_env *env, + const struct btf_type *t, + u32 meta_left) +{ + const struct btf_var_secinfo *vsi; + u64 last_vsi_end_off = 0, sum = 0; + u32 i, meta_needed; + + meta_needed = btf_type_vlen(t) * sizeof(*vsi); + if (meta_left < meta_needed) { + btf_verifier_log_basic(env, t, + "meta_left:%u meta_needed:%u", + meta_left, meta_needed); + return -EINVAL; + } + + if (!btf_type_vlen(t)) { + btf_verifier_log_type(env, t, "vlen == 0"); + return -EINVAL; + } + + if (!t->size) { + btf_verifier_log_type(env, t, "size == 0"); + return -EINVAL; + } + + if (btf_type_kflag(t)) { + btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); + return -EINVAL; + } + + if (!t->name_off || + 
!btf_name_valid_section(env->btf, t->name_off)) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } + + btf_verifier_log_type(env, t, NULL); + + for_each_vsi(i, t, vsi) { + /* A var cannot be in type void */ + if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid type_id"); + return -EINVAL; + } + + if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset"); + return -EINVAL; + } + + if (!vsi->size || vsi->size > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid size"); + return -EINVAL; + } + + last_vsi_end_off = vsi->offset + vsi->size; + if (last_vsi_end_off > t->size) { + btf_verifier_log_vsi(env, t, vsi, + "Invalid offset+size"); + return -EINVAL; + } + + btf_verifier_log_vsi(env, t, vsi, NULL); + sum += vsi->size; + } + + if (t->size < sum) { + btf_verifier_log_type(env, t, "Invalid btf_info size"); + return -EINVAL; + } + + return meta_needed; +} + +static int btf_datasec_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_var_secinfo *vsi; + struct btf *btf = env->btf; + u16 i; + + for_each_vsi_from(i, v->next_member, v->t, vsi) { + u32 var_type_id = vsi->type, type_id, type_size = 0; + const struct btf_type *var_type = btf_type_by_id(env->btf, + var_type_id); + if (!var_type || !btf_type_is_var(var_type)) { + btf_verifier_log_vsi(env, v->t, vsi, + "Not a VAR kind member"); + return -EINVAL; + } + + if (!env_type_is_resolve_sink(env, var_type) && + !env_type_is_resolved(env, var_type_id)) { + env_stack_set_next_member(env, i + 1); + return env_stack_push(env, var_type, var_type_id); + } + + type_id = var_type->type; + if (!btf_type_id_size(btf, &type_id, &type_size)) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); + return -EINVAL; + } + + if (vsi->size < type_size) { + btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); + return -EINVAL; + } + } + + env_stack_pop_resolved(env, 0, 0); + return 0; +} + +static void btf_datasec_log(struct btf_verifier_env *env, + const struct btf_type *t) +{ + btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +} + +static void btf_datasec_seq_show(const struct btf *btf, + const struct btf_type *t, u32 type_id, + void *data, u8 bits_offset, + struct seq_file *m) +{ + const struct btf_var_secinfo *vsi; + const struct btf_type *var; + u32 i; + + seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); + for_each_vsi(i, t, vsi) { + var = btf_type_by_id(btf, vsi->type); + if (i) + seq_puts(m, ","); + btf_type_ops(var)->seq_show(btf, var, vsi->type, + data + vsi->offset, bits_offset, m); + } + seq_puts(m, "}"); +} + +static const struct btf_kind_operations datasec_ops = { + .check_meta = btf_datasec_check_meta, + .resolve = btf_datasec_resolve, + .check_member = btf_df_check_member, + .check_kflag_member = btf_df_check_kflag_member, + .log_details = btf_datasec_log, + .seq_show = btf_datasec_seq_show, +}; + static int btf_func_proto_check(struct btf_verifier_env *env, const struct btf_type *t) { @@ -2434,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_RESTRICT] = &modifier_ops, [BTF_KIND_FUNC] = &func_ops, [BTF_KIND_FUNC_PROTO] = &func_proto_ops, + [BTF_KIND_VAR] = &var_ops, + [BTF_KIND_DATASEC] = &datasec_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -2514,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, if (!env_type_is_resolved(env, 
type_id)) return false; - if (btf_type_is_struct(t)) + if (btf_type_is_struct(t) || btf_type_is_datasec(t)) return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + !btf->resolved_sizes[type_id]; - if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) { + if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || + btf_type_is_var(t)) { t = btf_type_id_resolve(btf, &type_id); - return t && !btf_type_is_modifier(t); + return t && + !btf_type_is_modifier(t) && + !btf_type_is_var(t) && + !btf_type_is_datasec(t); } if (btf_type_is_array(t)) { diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index d17d05570a3f..0a00eaca6fae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1,33 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Functions to manage eBPF programs attached to cgroups * * Copyright (c) 2016 Daniel Mack - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/cgroup.h> +#include <linux/filter.h> #include <linux/slab.h> +#include <linux/sysctl.h> +#include <linux/string.h> #include <linux/bpf.h> #include <linux/bpf-cgroup.h> #include <net/sock.h> +#include <net/bpf_sk_storage.h> + +#include "../cgroup/cgroup-internal.h" DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +void cgroup_bpf_offline(struct cgroup *cgrp) +{ + cgroup_get(cgrp); + percpu_ref_kill(&cgrp->bpf.refcnt); +} + /** - * cgroup_bpf_put() - put references of all bpf programs - * @cgrp: the cgroup to modify + * cgroup_bpf_release() - put references of all bpf programs and + * release all cgroup bpf data + * @work: work structure embedded into the cgroup to modify */ -void cgroup_bpf_put(struct cgroup *cgrp) +static void cgroup_bpf_release(struct work_struct *work) { + struct cgroup *cgrp = container_of(work, struct cgroup, + bpf.release_work); enum bpf_cgroup_storage_type stype; + struct bpf_prog_array *old_array; unsigned int type; + mutex_lock(&cgroup_mutex); + for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog_list *pl, *tmp; @@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp) kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } - bpf_prog_array_free(cgrp->bpf.effective[type]); + old_array = rcu_dereference_protected( + cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + bpf_prog_array_free(old_array); } + + mutex_unlock(&cgroup_mutex); + + percpu_ref_exit(&cgrp->bpf.refcnt); + cgroup_put(cgrp); +} + +/** + * cgroup_bpf_release_fn() - callback used to schedule releasing + * of bpf cgroup data + * @ref: percpu ref counter structure + */ +static void cgroup_bpf_release_fn(struct percpu_ref *ref) +{ + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); + + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); + queue_work(system_wq, &cgrp->bpf.release_work); } /* count number of elements in the list. 
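The kernel/bpf/cgroup.c hunks above move cgroup_bpf teardown behind a percpu_ref: cgroup_bpf_offline() kills the reference, and the release callback only schedules the real cleanup onto a workqueue so the heavy work runs in process context under cgroup_mutex. As a hedged illustration of that init/kill/deferred-release shape only — every example_* identifier below is hypothetical and not part of this patch — a minimal standalone sketch looks like this:

#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct example_obj {
	struct percpu_ref refcnt;
	struct work_struct release_work;
};

static void example_release_workfn(struct work_struct *work)
{
	struct example_obj *obj =
		container_of(work, struct example_obj, release_work);

	/* Heavy teardown runs here, in process context. */
	percpu_ref_exit(&obj->refcnt);
	kfree(obj);
}

static void example_release_fn(struct percpu_ref *ref)
{
	struct example_obj *obj =
		container_of(ref, struct example_obj, refcnt);

	/* The ref release callback may run in atomic context: defer. */
	INIT_WORK(&obj->release_work, example_release_workfn);
	queue_work(system_wq, &obj->release_work);
}

static int example_obj_init(struct example_obj *obj)
{
	/* One percpu reference guards the object's lifetime. */
	return percpu_ref_init(&obj->refcnt, example_release_fn, 0,
			       GFP_KERNEL);
}

static void example_obj_offline(struct example_obj *obj)
{
	/* Drop the initial reference; release fires once users are gone. */
	percpu_ref_kill(&obj->refcnt);
}

The patch itself follows the same pattern with cgroup-specific extras (taking a cgroup reference before the kill and skipping dead cgroups in update_effective_progs()); the sketch only shows the generic lifetime shape.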
@@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, */ static int compute_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu **array) + struct bpf_prog_array **array) { enum bpf_cgroup_storage_type stype; struct bpf_prog_array *progs; @@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp, } } while ((p = cgroup_parent(p))); - rcu_assign_pointer(*array, progs); + *array = progs; return 0; } static void activate_effective_progs(struct cgroup *cgrp, enum bpf_attach_type type, - struct bpf_prog_array __rcu *array) + struct bpf_prog_array *old_array) { - struct bpf_prog_array __rcu *old_array; - - old_array = xchg(&cgrp->bpf.effective[type], array); + rcu_swap_protected(cgrp->bpf.effective[type], old_array, + lockdep_is_held(&cgroup_mutex)); /* free prog array after grace period, since __cgroup_bpf_run_*() * might be still walking the array */ @@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) * that array below is variable length */ #define NR ARRAY_SIZE(cgrp->bpf.effective) - struct bpf_prog_array __rcu *arrays[NR] = {}; - int i; + struct bpf_prog_array *arrays[NR] = {}; + int ret, i; + + ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, + GFP_KERNEL); + if (ret) + return ret; for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); @@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) cleanup: for (i = 0; i < NR; i++) bpf_prog_array_free(arrays[i]); + + percpu_ref_exit(&cgrp->bpf.refcnt); + return -ENOMEM; } @@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + err = compute_effective_progs(desc, type, &desc->bpf.inactive); if (err) goto cleanup; @@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp, css_for_each_descendant_pre(css, &cgrp->self) { struct cgroup *desc = container_of(css, struct cgroup, self); + if (percpu_ref_is_zero(&desc->bpf.refcnt)) { + if (unlikely(desc->bpf.inactive)) { + bpf_prog_array_free(desc->bpf.inactive); + desc->bpf.inactive = NULL; + } + continue; + } + activate_effective_progs(desc, type, desc->bpf.inactive); desc->bpf.inactive = NULL; } @@ -230,6 +284,7 @@ cleanup: * @cgrp: The cgroup which descendants to traverse * @prog: A program to attach * @type: Type of attach operation + * @flags: Option flags * * Must be called with cgroup_mutex held. */ @@ -363,7 +418,7 @@ cleanup: * Must be called with cgroup_mutex held. 
*/ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type, u32 unused_flags) + enum bpf_attach_type type) { struct list_head *progs = &cgrp->bpf.progs[type]; enum bpf_cgroup_storage_type stype; @@ -440,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, enum bpf_attach_type type = attr->query.attach_type; struct list_head *progs = &cgrp->bpf.progs[type]; u32 flags = cgrp->bpf.flags[type]; + struct bpf_prog_array *effective; int cnt, ret = 0, i; + effective = rcu_dereference_protected(cgrp->bpf.effective[type], + lockdep_is_held(&cgroup_mutex)); + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) - cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); + cnt = bpf_prog_array_length(effective); else cnt = prog_list_length(progs); @@ -460,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, } if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { - return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], - prog_ids, cnt); + return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); } else { struct bpf_prog_list *pl; u32 id; @@ -544,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * The program type passed in via @type must be suitable for network * filtering. No further check is performed to assert that. * - * This function will return %-EPERM if any if an attached program was found - * and if it returned != 1 during execution. In all other cases, 0 is returned. + * For egress packets, this function can return: + * NET_XMIT_SUCCESS (0) - continue with packet output + * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr + * NET_XMIT_CN (2) - continue with packet output and notify TCP + * to call cwr + * -EPERM - drop packet + * + * For ingress packets, this function will return -EPERM if any + * attached program was found and if it returned != 1 during execution. + * Otherwise 0 is returned. */ int __cgroup_bpf_run_filter_skb(struct sock *sk, struct sk_buff *skb, @@ -571,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, /* compute pointers for the bpf prog */ bpf_compute_and_save_data_end(skb, &saved_data_end); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, - __bpf_prog_run_save_cb); + if (type == BPF_CGROUP_INET_EGRESS) { + ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( + cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); + } else { + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, + __bpf_prog_run_save_cb); + ret = (ret == 1 ? 0 : -EPERM); + } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); skb->sk = save_sk; - return ret == 1 ? 
0 : -EPERM; + + return ret; } EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); @@ -700,7 +773,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -709,6 +782,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: @@ -724,6 +803,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return cgroup_base_func_proto(func_id, prog); +} + static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -761,3 +846,692 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { .get_func_proto = cgroup_dev_func_proto, .is_valid_access = cgroup_dev_is_valid_access, }; + +/** + * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl + * + * @head: sysctl table header + * @table: sysctl table + * @write: sysctl is being read (= 0) or written (= 1) + * @buf: pointer to buffer passed by user space + * @pcount: value-result argument: value is size of buffer pointed to by @buf, + * result is size of @new_buf if program set new value, initial value + * otherwise + * @ppos: value-result argument: value is position at which read from or write + * to sysctl is happening, result is new position if program overrode it, + * initial value otherwise + * @new_buf: pointer to pointer to new buffer that will be allocated if program + * overrides new value provided by user space on sysctl write + * NOTE: it's caller responsibility to free *new_buf if it was set + * @type: type of program to be executed + * + * Program is run when sysctl is being accessed, either read or written, and + * can allow or deny such access. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases 0 is returned. + */ +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + void __user *buf, size_t *pcount, + loff_t *ppos, void **new_buf, + enum bpf_attach_type type) +{ + struct bpf_sysctl_kern ctx = { + .head = head, + .table = table, + .write = write, + .ppos = ppos, + .cur_val = NULL, + .cur_len = PAGE_SIZE, + .new_val = NULL, + .new_len = 0, + .new_updated = 0, + }; + struct cgroup *cgrp; + int ret; + + ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); + if (ctx.cur_val) { + mm_segment_t old_fs; + loff_t pos = 0; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, + &ctx.cur_len, &pos)) { + /* Let BPF program decide how to proceed. */ + ctx.cur_len = 0; + } + set_fs(old_fs); + } else { + /* Let BPF program decide how to proceed. 
*/ + ctx.cur_len = 0; + } + + if (write && buf && *pcount) { + /* BPF program should be able to override new value with a + * buffer bigger than provided by user. + */ + ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); + ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); + if (!ctx.new_val || + copy_from_user(ctx.new_val, buf, ctx.new_len)) + /* Let BPF program decide how to proceed. */ + ctx.new_len = 0; + } + + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + rcu_read_unlock(); + + kfree(ctx.cur_val); + + if (ret == 1 && ctx.new_updated) { + *new_buf = ctx.new_val; + *pcount = ctx.new_len; + } else { + kfree(ctx.new_val); + } + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); + +#ifdef CONFIG_NET +static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, + enum bpf_attach_type attach_type) +{ + struct bpf_prog_array *prog_array; + bool empty; + + rcu_read_lock(); + prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); + empty = bpf_prog_array_is_empty(prog_array); + rcu_read_unlock(); + + return empty; +} + +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +{ + if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) + return -EINVAL; + + ctx->optval = kzalloc(max_optlen, GFP_USER); + if (!ctx->optval) + return -ENOMEM; + + ctx->optval_end = ctx->optval + max_optlen; + ctx->optlen = max_optlen; + + return 0; +} + +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +{ + kfree(ctx->optval); +} + +int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = *level, + .optname = *optname, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. + */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + return 0; + + ret = sockopt_alloc_buf(&ctx, *optlen); + if (ret) + return ret; + + if (copy_from_user(ctx.optval, optval, *optlen) != 0) { + ret = -EFAULT; + goto out; + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen == -1) { + /* optlen set to -1, bypass kernel */ + ret = 1; + } else if (ctx.optlen > *optlen || ctx.optlen < -1) { + /* optlen is out of bounds */ + ret = -EFAULT; + } else { + /* optlen within bounds, run kernel handler */ + ret = 0; + + /* export any potential modifications */ + *level = ctx.level; + *optname = ctx.optname; + *optlen = ctx.optlen; + *kernel_optval = ctx.optval; + } + +out: + if (ret) + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); + +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + }; + int ret; + + /* Opportunistic check to see whether we have any BPF program + * attached to the hook so we don't waste time allocating + * memory and locking the socket. 
+ */ + if (!cgroup_bpf_enabled || + __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + return retval; + + ret = sockopt_alloc_buf(&ctx, max_optlen); + if (ret) + return ret; + + if (!retval) { + /* If kernel getsockopt finished successfully, + * copy whatever was returned to the user back + * into our temporary buffer. Set optlen to the + * one that kernel returned as well to let + * BPF programs inspect the value. + */ + + if (get_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + if (ctx.optlen > max_optlen) + ctx.optlen = max_optlen; + + if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { + ret = -EFAULT; + goto out; + } + } + + lock_sock(sk); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + release_sock(sk); + + if (!ret) { + ret = -EPERM; + goto out; + } + + if (ctx.optlen > max_optlen) { + ret = -EFAULT; + goto out; + } + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) { + ret = -EFAULT; + goto out; + } + + if (copy_to_user(optval, ctx.optval, ctx.optlen) || + put_user(ctx.optlen, optlen)) { + ret = -EFAULT; + goto out; + } + + ret = ctx.retval; + +out: + sockopt_free_buf(&ctx); + return ret; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); +#endif + +static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, + size_t *lenp) +{ + ssize_t tmp_ret = 0, ret; + + if (dir->header.parent) { + tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); + if (ret < 0) + return ret; + *bufp += ret; + *lenp -= ret; + ret += tmp_ret; + + /* Avoid leading slash. */ + if (!ret) + return ret; + + tmp_ret = strscpy(*bufp, "/", *lenp); + if (tmp_ret < 0) + return tmp_ret; + *bufp += tmp_ret; + *lenp -= tmp_ret; + + return ret + tmp_ret; +} + +BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len, u64, flags) +{ + ssize_t tmp_ret = 0, ret; + + if (!buf) + return -EINVAL; + + if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { + if (!ctx->head) + return -EINVAL; + tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); + if (tmp_ret < 0) + return tmp_ret; + } + + ret = strscpy(buf, ctx->table->procname, buf_len); + + return ret < 0 ? 
ret : tmp_ret + ret; +} + +static const struct bpf_func_proto bpf_sysctl_get_name_proto = { + .func = bpf_sysctl_get_name, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static int copy_sysctl_value(char *dst, size_t dst_len, char *src, + size_t src_len) +{ + if (!dst) + return -EINVAL; + + if (!dst_len) + return -E2BIG; + + if (!src || !src_len) { + memset(dst, 0, dst_len); + return -EINVAL; + } + + memcpy(dst, src, min(dst_len, src_len)); + + if (dst_len > src_len) { + memset(dst + src_len, '\0', dst_len - src_len); + return src_len; + } + + dst[dst_len - 1] = '\0'; + + return -E2BIG; +} + +BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, + char *, buf, size_t, buf_len) +{ + return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { + .func = bpf_sysctl_get_current_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, + size_t, buf_len) +{ + if (!ctx->write) { + if (buf && buf_len) + memset(buf, '\0', buf_len); + return -EINVAL; + } + return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +} + +static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { + .func = bpf_sysctl_get_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, + const char *, buf, size_t, buf_len) +{ + if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) + return -EINVAL; + + if (buf_len > PAGE_SIZE - 1) + return -E2BIG; + + memcpy(ctx->new_val, buf, buf_len); + ctx->new_len = buf_len; + ctx->new_updated = 1; + + return 0; +} + +static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { + .func = bpf_sysctl_set_new_value, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + +static const struct bpf_func_proto * +sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_strtol: + return &bpf_strtol_proto; + case BPF_FUNC_strtoul: + return &bpf_strtoul_proto; + case BPF_FUNC_sysctl_get_name: + return &bpf_sysctl_get_name_proto; + case BPF_FUNC_sysctl_get_current_value: + return &bpf_sysctl_get_current_value_proto; + case BPF_FUNC_sysctl_get_new_value: + return &bpf_sysctl_get_new_value_proto; + case BPF_FUNC_sysctl_set_new_value: + return &bpf_sysctl_set_new_value_proto; + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) + return false; + + switch (off) { + case offsetof(struct bpf_sysctl, write): + if (type != BPF_READ) + return false; + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sysctl, file_pos): + if (type == BPF_READ) { + 
bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } else { + return size == size_default; + } + default: + return false; + } +} + +static u32 sysctl_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sysctl, write): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct bpf_sysctl_kern, write, + FIELD_SIZEOF(struct bpf_sysctl_kern, + write), + target_size)); + break; + case offsetof(struct bpf_sysctl, file_pos): + /* ppos is a pointer so it should be accessed via indirect + * loads and stores. Also for stores additional temporary + * register is used since neither src_reg nor dst_reg can be + * overridden. + */ + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM( + BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_STX_MEM( + BPF_SIZEOF(u32), treg, si->src_reg, 0); + *insn++ = BPF_LDX_MEM( + BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sysctl_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sysctl_kern, ppos)); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); + } + *target_size = sizeof(u32); + break; + } + + return insn - insn_buf; +} + +const struct bpf_verifier_ops cg_sysctl_verifier_ops = { + .get_func_proto = sysctl_func_proto, + .is_valid_access = sysctl_is_valid_access, + .convert_ctx_access = sysctl_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sysctl_prog_ops = { +}; + +static const struct bpf_func_proto * +cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { +#ifdef CONFIG_NET + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#endif +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif + default: + return cgroup_base_func_proto(func_id, prog); + } +} + +static bool cg_sockopt_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sockopt)) + return false; + + if (off % size != 0) + return false; + + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_GETSOCKOPT; + case offsetof(struct bpf_sockopt, optname): + /* fallthrough */ + case offsetof(struct bpf_sockopt, level): + if (size != size_default) + return false; + return prog->expected_attach_type == + BPF_CGROUP_SETSOCKOPT; + case offsetof(struct bpf_sockopt, optlen): + return size == size_default; + default: + return false; + } + } + + switch (off) { + case offsetof(struct bpf_sockopt, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; + case offsetof(struct bpf_sockopt, optval): + if 
(size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET; + break; + case offsetof(struct bpf_sockopt, optval_end): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_PACKET_END; + break; + case offsetof(struct bpf_sockopt, retval): + if (size != size_default) + return false; + return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; + default: + if (size != size_default) + return false; + break; + } + return true; +} + +#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ + T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sockopt_kern, F)) + +static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + + switch (si->off) { + case offsetof(struct bpf_sockopt, sk): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); + break; + case offsetof(struct bpf_sockopt, level): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); + break; + case offsetof(struct bpf_sockopt, optname): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); + break; + case offsetof(struct bpf_sockopt, optlen): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); + break; + case offsetof(struct bpf_sockopt, retval): + if (type == BPF_WRITE) + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); + else + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + break; + case offsetof(struct bpf_sockopt, optval): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); + break; + case offsetof(struct bpf_sockopt, optval_end): + *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); + break; + } + + return insn - insn_buf; +} + +static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, + bool direct_write, + const struct bpf_prog *prog) +{ + /* Nothing to do for sockopt argument. The data is kzalloc'ated. + */ + return 0; +} + +const struct bpf_verifier_ops cg_sockopt_verifier_ops = { + .get_func_proto = cg_sockopt_func_proto, + .is_valid_access = cg_sockopt_is_valid_access, + .convert_ctx_access = cg_sockopt_convert_ctx_access, + .gen_prologue = cg_sockopt_get_prologue, +}; + +const struct bpf_prog_ops cg_sockopt_prog_ops = { +}; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..16079550db6d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * @@ -12,11 +13,6 @@ * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Andi Kleen - Fix a few bad bugs and races. 
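The verifier ops above back two new attach points: BPF_PROG_TYPE_CGROUP_SYSCTL programs (cg_sysctl_verifier_ops) and BPF_PROG_TYPE_CGROUP_SOCKOPT programs (cg_sockopt_verifier_ops). As a rough illustration of the sysctl side, a program attached at BPF_CGROUP_SYSCTL might look like the sketch below; the section name and helper declarations follow the selftests' bpf_helpers.h conventions of this era, and the deny-writes policy is made up:

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("cgroup/sysctl")
int sysctl_guard(struct bpf_sysctl *ctx)
{
	char name[64] = {};

	/* flags == 0: full name relative to /proc/sys, e.g. "net/core/somaxconn" */
	if (bpf_sysctl_get_name(ctx, name, sizeof(name), 0) < 0)
		return 1;		/* allow if the name does not fit */

	if (ctx->write)			/* readable per sysctl_is_valid_access() */
		return 0;		/* reject: the write fails with -EPERM */

	return 1;			/* allow reads */
}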
* Kris Katterjohn - Added many additional checks in bpf_check_classic() */ @@ -78,7 +74,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns return NULL; } -struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; @@ -104,6 +100,32 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) return fp; } + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + struct bpf_prog *prog; + int cpu; + + prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); + if (!prog) + return NULL; + + prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->aux->stats) { + kfree(prog->aux); + vfree(prog); + return NULL; + } + + for_each_possible_cpu(cpu) { + struct bpf_prog_stats *pstats; + + pstats = per_cpu_ptr(prog->aux->stats, cpu); + u64_stats_init(&pstats->syncp); + } + return prog; +} EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) @@ -231,7 +253,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, void __bpf_prog_free(struct bpf_prog *fp) { - kfree(fp->aux); + if (fp->aux) { + free_percpu(fp->aux->stats); + kfree(fp->aux); + } vfree(fp); } @@ -263,7 +288,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && - dst[i].src_reg == BPF_PSEUDO_MAP_FD) { + (dst[i].src_reg == BPF_PSEUDO_MAP_FD || + dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && @@ -307,15 +333,16 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s32 delta = end_new - end_old; s64 imm = insn->imm; - if (curr < pos && curr + imm + 1 > pos) + if (curr < pos && curr + imm + 1 >= end_old) imm += delta; - else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; @@ -324,15 +351,16 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, - u32 curr, const bool probe_pass) +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + s32 end_new, s32 curr, const bool probe_pass) { const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 delta = end_new - end_old; s32 off = insn->off; - if (curr < pos && curr + off + 1 > pos) + if (curr < pos && curr + off + 1 >= end_old) off += delta; - else if (curr > pos + delta && curr + off + 1 <= pos + delta) + else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; @@ -341,10 +369,10 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, return 0; } -static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, - const bool probe_pass) +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, + s32 end_new, 
const bool probe_pass) { - u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); + u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; @@ -356,22 +384,23 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, * do any other adjustments. Therefore skip the patchlet. */ if (probe_pass && i == pos) { - i += delta + 1; - insn++; + i = end_new; + insn = prog->insnsi + end_old; } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP || + if ((BPF_CLASS(code) != BPF_JMP && + BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; - ret = bpf_adj_delta_to_imm(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_imm(insn, pos, end_old, + end_new, i, probe_pass); } else { - ret = bpf_adj_delta_to_off(insn, pos, delta, i, - probe_pass); + ret = bpf_adj_delta_to_off(insn, pos, end_old, + end_new, i, probe_pass); } if (ret) break; @@ -406,6 +435,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; + int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { @@ -421,8 +451,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && - bpf_adj_branches(prog, off, insn_delta, true)) - return NULL; + (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) + return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as @@ -431,7 +461,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) - return NULL; + return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; @@ -453,13 +483,25 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. 
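With the errno now carried via ERR_PTR() instead of a bare NULL, callers of bpf_patch_insn_single() can distinguish -ENOMEM from branch-offset overflow (-ERANGE). A caller-side sketch of the resulting pattern (the wrapper function itself is hypothetical):

static int patch_one(struct bpf_prog **prog, u32 off,
		     const struct bpf_insn *patchlet, u32 len)
{
	struct bpf_prog *new_prog;

	new_prog = bpf_patch_insn_single(*prog, off, patchlet, len);
	if (IS_ERR(new_prog))
		return PTR_ERR(new_prog);	/* -ENOMEM or -ERANGE */
	*prog = new_prog;
	return 0;
}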
*/ - BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); + BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } +int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) +{ + /* Branch offsets can't overflow when program is shrinking, no need + * to call bpf_adj_branches(..., true) here + */ + memmove(prog->insnsi + off, prog->insnsi + off + cnt, + sizeof(struct bpf_insn) * (prog->len - off - cnt)); + prog->len -= cnt; + + return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); +} + void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; @@ -495,7 +537,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; @@ -804,7 +846,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_unlock_ro(hdr); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); @@ -934,6 +975,27 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; + case BPF_JMP32 | BPF_JEQ | BPF_K: + case BPF_JMP32 | BPF_JNE | BPF_K: + case BPF_JMP32 | BPF_JGT | BPF_K: + case BPF_JMP32 | BPF_JLT | BPF_K: + case BPF_JMP32 | BPF_JGE | BPF_K: + case BPF_JMP32 | BPF_JLE | BPF_K: + case BPF_JMP32 | BPF_JSGT | BPF_K: + case BPF_JMP32 | BPF_JSLT | BPF_K: + case BPF_JMP32 | BPF_JSGE | BPF_K: + case BPF_JMP32 | BPF_JSLE | BPF_K: + case BPF_JMP32 | BPF_JSET | BPF_K: + /* Accommodate for extra offset in case of a backjump. */ + off = from->off; + if (off < 0) + off -= 2; + *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); + *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); + *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, + off); + break; + case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); @@ -1031,13 +1093,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); - if (!tmp) { + if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); - return ERR_PTR(-ENOMEM); + return tmp; } clone = tmp; @@ -1130,6 +1192,31 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ + /* 32-bit Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP32, JEQ, X), \ + INSN_3(JMP32, JNE, X), \ + INSN_3(JMP32, JGT, X), \ + INSN_3(JMP32, JLT, X), \ + INSN_3(JMP32, JGE, X), \ + INSN_3(JMP32, JLE, X), \ + INSN_3(JMP32, JSGT, X), \ + INSN_3(JMP32, JSLT, X), \ + INSN_3(JMP32, JSGE, X), \ + INSN_3(JMP32, JSLE, X), \ + INSN_3(JMP32, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP32, JEQ, K), \ + INSN_3(JMP32, JNE, K), \ + INSN_3(JMP32, JGT, K), \ + INSN_3(JMP32, JLT, K), \ + INSN_3(JMP32, JGE, K), \ + INSN_3(JMP32, JLE, K), \ + INSN_3(JMP32, JSGT, K), \ + INSN_3(JMP32, JSLT, K), \ + INSN_3(JMP32, JSGE, K), \ + INSN_3(JMP32, JSLE, K), \ + INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. 
*/ \ INSN_3(JMP, JEQ, X), \ @@ -1202,8 +1289,9 @@ bool bpf_opcode_in_insntable(u8 code) #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context - * @ctx: is the data we are operating on + * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions + * @stack: is the eBPF storage stack * * Decode and execute eBPF instructions. */ @@ -1276,10 +1364,10 @@ select_insn: insn++; CONT; ALU_ARSH_X: - DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); + DST = (u64) (u32) (((s32) DST) >> SRC); CONT; ALU_ARSH_K: - DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); + DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= SRC; @@ -1390,145 +1478,49 @@ select_insn: out: CONT; } - /* JMP */ JMP_JA: insn += insn->off; CONT; - JMP_JEQ_X: - if (DST == SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JEQ_K: - if (DST == IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_X: - if (DST != SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JNE_K: - if (DST != IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_X: - if (DST > SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGT_K: - if (DST > IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_X: - if (DST < SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLT_K: - if (DST < IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_X: - if (DST >= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JGE_K: - if (DST >= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_X: - if (DST <= SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JLE_K: - if (DST <= IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_X: - if (((s64) DST) > ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGT_K: - if (((s64) DST) > ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_X: - if (((s64) DST) < ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLT_K: - if (((s64) DST) < ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_X: - if (((s64) DST) >= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSGE_K: - if (((s64) DST) >= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_X: - if (((s64) DST) <= ((s64) SRC)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSLE_K: - if (((s64) DST) <= ((s64) IMM)) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_X: - if (DST & SRC) { - insn += insn->off; - CONT_JMP; - } - CONT; - JMP_JSET_K: - if (DST & IMM) { - insn += insn->off; - CONT_JMP; - } - CONT; JMP_EXIT: return BPF_R0; - + /* JMP */ +#define COND_JMP(SIGN, OPCODE, CMP_OP) \ + JMP_##OPCODE##_X: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_X: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP_##OPCODE##_K: \ + if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; \ + JMP32_##OPCODE##_K: \ + if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ + insn += insn->off; \ + CONT_JMP; \ + } \ + CONT; + COND_JMP(u, JEQ, ==) + COND_JMP(u, JNE, !=) + COND_JMP(u, JGT, >) + COND_JMP(u, JLT, <) + COND_JMP(u, JGE, >=) + COND_JMP(u, JLE, <=) + COND_JMP(u, JSET, &) + COND_JMP(s, JSGT, >) + COND_JMP(s, JSLT, <) + COND_JMP(s, JSGE, >=) + COND_JMP(s, JSLE, <=) +#undef COND_JMP /* STX and ST and LDX*/ #define LDST(SIZEOP, SIZE) \ 
STX_MEM_##SIZEOP: \ @@ -1799,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) return &empty_prog_array.hdr; } -void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) +void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || - progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) + if (!progs || progs == &empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } -int bpf_prog_array_length(struct bpf_prog_array __rcu *array) +int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; - rcu_read_lock(); - item = rcu_dereference(array)->items; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; - rcu_read_unlock(); return cnt; } +bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +{ + struct bpf_prog_array_item *item; -static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, + for (item = array->items; item->prog; item++) + if (item->prog != &dummy_bpf_prog.prog) + return false; + return true; +} + +static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; - item = rcu_dereference_check(array, 1)->items; - for (; item->prog; item++) { + for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; @@ -1843,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, return !!(item->prog); } -int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; @@ -1854,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); - * so below kcalloc doesn't need extra cnt > 0 check, but - * bpf_prog_array_length() releases rcu lock and - * prog array could have been swapped with empty or larger array, - * so always copy 'cnt' prog_ids to the user. - * In a rare race the user will see zero prog_ids + * so below kcalloc doesn't need extra cnt > 0 check. 
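For reference, the BPF_JMP32 conditional jumps added in the hunks above compare only the low 32 bits of their operands, while the existing BPF_JMP forms keep comparing all 64. A raw-instruction sketch using the insn macros from include/linux/filter.h (the constants are arbitrary):

	struct bpf_insn jmp32_demo[] = {
		BPF_LD_IMM64(BPF_REG_1, 0x100000003ULL),
		BPF_MOV64_IMM(BPF_REG_0, 0),
		/* w1 == 3, so this 32-bit compare is taken; the 64-bit
		 * BPF_JMP | BPF_JLT form would fall through instead.
		 */
		BPF_JMP32_IMM(BPF_JLT, BPF_REG_1, 10, 1),
		BPF_EXIT_INSN(),		/* skipped when the jump is taken */
		BPF_MOV64_IMM(BPF_REG_0, 1),
		BPF_EXIT_INSN(),		/* returns 1 via the JMP32 path */
	};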
*/ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; - rcu_read_lock(); nospc = bpf_prog_array_copy_core(array, ids, cnt); - rcu_read_unlock(); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) @@ -1875,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, return 0; } -void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, +void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { - struct bpf_prog_array_item *item = array->items; + struct bpf_prog_array_item *item; - for (; item->prog; item++) + for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } -int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, +int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, struct bpf_prog_array **new_array) @@ -1951,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } -int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, +int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { @@ -2036,6 +2026,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_spin_lock_proto __weak; +const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; @@ -2092,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func) return false; } +/* Return TRUE if the JIT backend wants verifier to enable sub-register usage + * analysis code and wants explicit zero extension inserted by verifier. + * Otherwise, return FALSE. + */ +bool __weak bpf_jit_needs_zext(void) +{ + return false; +} + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ @@ -2101,8 +2102,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, return -EFAULT; } +DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +EXPORT_SYMBOL(bpf_stats_enabled_key); + /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 8974b3755670..ef49e17ae47c 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -1,7 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* bpf/cpumap.c * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. - * Released under terms in GPL version 2. See COPYING. */ /* The 'cpumap' is primarily used as a backend map for XDP BPF helper @@ -32,14 +32,19 @@ /* General idea: XDP packets getting XDP redirected to another CPU, * will maximum be stored/queued for one driver ->poll() call. It is - * guaranteed that setting flush bit and flush operation happen on + * guaranteed that queueing the frame and the flush operation happen on * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr() * which queue in bpf_cpu_map_entry contains packets. 
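The conversion from a flush bitmap to a per-CPU flush list is invisible to programs; a cpumap is still fed from XDP via bpf_redirect_map(). A sketch of the program side (map name, queue size and the target CPU are illustrative; assumes the bpf_map_def/SEC() style used by libbpf at this time):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") cpu_map = {
	.type		= BPF_MAP_TYPE_CPUMAP,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(__u32),	/* per-CPU ptr_ring size (qsize) */
	.max_entries	= 64,
};

SEC("xdp")
int xdp_redirect_to_cpu(struct xdp_md *ctx)
{
	__u32 target_cpu = 2;	/* hypothetical; often derived from the RX hash */

	/* The frame is queued on this CPU's bulk queue; __cpu_map_flush()
	 * walks the per-CPU flush_list at the end of the driver's NAPI poll.
	 */
	return bpf_redirect_map(&cpu_map, target_cpu, 0);
}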
*/ #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ +struct bpf_cpu_map_entry; +struct bpf_cpu_map; + struct xdp_bulk_queue { void *q[CPU_MAP_BULK_SIZE]; + struct list_head flush_node; + struct bpf_cpu_map_entry *obj; unsigned int count; }; @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ struct xdp_bulk_queue __percpu *bulkq; + struct bpf_cpu_map *cmap; + /* Queue with potential multi-producers, and single-consumer kthread */ struct ptr_ring *queue; struct task_struct *kthread; @@ -65,23 +72,17 @@ struct bpf_cpu_map { struct bpf_map map; /* Below members specific for map type */ struct bpf_cpu_map_entry **cpu_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx); - -static u64 cpu_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); -} +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { struct bpf_cpu_map *cmap; int err = -ENOMEM; + int ret, cpu; u64 cost; - int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_cmap; - cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + cost += sizeof(struct list_head) * num_possible_cpus(); /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_precharge_memlock(cmap->map.pages); + ret = bpf_map_charge_init(&cmap->map.memory, cost); if (ret) { err = ret; goto free_cmap; } - /* A per cpu bitfield with a bit per possible CPU in map */ - cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), - __alignof__(unsigned long)); - if (!cmap->flush_needed) - goto free_cmap; + cmap->flush_list = alloc_percpu(struct list_head); + if (!cmap->flush_list) + goto free_charge; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * @@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) return &cmap->map; free_percpu: - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); +free_charge: + bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); @@ -160,11 +161,15 @@ static void cpu_map_kthread_stop(struct work_struct *work) } static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_frame *xdpf) + struct xdp_frame *xdpf, + struct sk_buff *skb) { + unsigned int hard_start_headroom; unsigned int frame_size; void *pkt_data_start; - struct sk_buff *skb; + + /* Part of headroom was reserved to xdpf */ + hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; /* build_skb need to place skb_shared_info after SKB end, and * also want to know the memory "truesize". Thus, need to @@ -183,15 +188,15 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. 
*/ - frame_size = SKB_DATA_ALIGN(xdpf->len + xdpf->headroom) + + frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdpf->data - xdpf->headroom; - skb = build_skb(pkt_data_start, frame_size); - if (!skb) + pkt_data_start = xdpf->data - hard_start_headroom; + skb = build_skb_around(skb, pkt_data_start, frame_size); + if (unlikely(!skb)) return NULL; - skb_reserve(skb, xdpf->headroom); + skb_reserve(skb, hard_start_headroom); __skb_put(skb, xdpf->len); if (xdpf->metasize) skb_metadata_set(skb, xdpf->metasize); @@ -205,6 +210,12 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * - RX ring dev queue index (skb_record_rx_queue) */ + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + return skb; } @@ -233,6 +244,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) } } +#define CPUMAP_BATCH 8 + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -245,8 +258,11 @@ static int cpu_map_kthread_run(void *data) * kthread_stop signal until queue is empty. */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { - unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_frame *xdpf; + unsigned int drops = 0, sched = 0; + void *frames[CPUMAP_BATCH]; + void *skbs[CPUMAP_BATCH]; + gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; + int i, n, m; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -262,18 +278,38 @@ static int cpu_map_kthread_run(void *data) sched = cond_resched(); } - /* Process packets in rcpu->queue */ - local_bh_disable(); /* * The bpf_cpu_map_entry is single consumer, with this * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdpf = __ptr_ring_consume(rcpu->queue))) { - struct sk_buff *skb; + n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH); + + for (i = 0; i < n; i++) { + void *f = frames[i]; + struct page *page = virt_to_page(f); + + /* Bring struct page memory area to curr CPU. Read by + * build_skb_around via page_is_pfmemalloc(), and when + * freed written by page_frag_free call. 
+ */ + prefetchw(page); + } + + m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs); + if (unlikely(m == 0)) { + for (i = 0; i < n; i++) + skbs[i] = NULL; /* effect: xdp_return_frame */ + drops = n; + } + + local_bh_disable(); + for (i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(rcpu, xdpf); + skb = cpu_map_build_skb(rcpu, xdpf, skb); if (!skb) { xdp_return_frame(xdpf); continue; @@ -283,13 +319,9 @@ static int cpu_map_kthread_run(void *data) ret = netif_receive_skb_core(skb); if (ret == NET_RX_DROP) drops++; - - /* Limit BH-disable period */ - if (++processed == 8) - break; } /* Feedback loop via tracepoint */ - trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched); + trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched); local_bh_enable(); /* resched point, may call do_softirq() */ } @@ -304,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; - int numa, err; + struct xdp_bulk_queue *bq; + int numa, err, i; /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); @@ -319,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, if (!rcpu->bulkq) goto free_rcu; + for_each_possible_cpu(i) { + bq = per_cpu_ptr(rcpu->bulkq, i); + bq->obj = rcpu; + } + /* Alloc queue */ rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); if (!rcpu->queue) @@ -375,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); /* No concurrent bq_enqueue can run at this point */ - bq_flush_to_queue(rcpu, bq, false); + bq_flush_to_queue(bq, false); } free_percpu(rcpu->bulkq); /* Cannot kthread_stop() here, last put free rcpu resources */ @@ -458,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); if (!rcpu) return -ENOMEM; + rcpu->cmap = cmap; } rcu_read_lock(); __cpu_map_entry_replace(cmap, key_cpu, rcpu); @@ -484,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) synchronize_rcu(); /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. - * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * list be empty on _all_ cpus. Because the above synchronize_rcu() + * ensures the map is disconnected from the program we can assume no new + * items will be added to the list. 
*/ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); - while (!bitmap_empty(bitmap, cmap->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -508,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) /* bq flush and cleanup happens after RCU graze-period */ __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ } - free_percpu(cmap->flush_needed); + free_percpu(cmap->flush_list); bpf_map_area_free(cmap->cpu_map); kfree(cmap); } @@ -560,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { .map_check_btf = map_check_no_btf, }; -static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, - struct xdp_bulk_queue *bq, bool in_napi_ctx) +static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) { + struct bpf_cpu_map_entry *rcpu = bq->obj; unsigned int processed = 0, drops = 0; const int to_cpu = rcpu->cpu; struct ptr_ring *q; @@ -591,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, bq->count = 0; spin_unlock(&q->producer_lock); + __list_del_clearprev(&bq->flush_node); + /* Feedback loop via tracepoints */ trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); return 0; @@ -601,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, */ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { + struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) - bq_flush_to_queue(rcpu, bq, true); + bq_flush_to_queue(bq, true); /* Notice, xdp_buff/page MUST be queued here, long enough for * driver to code invoking us to finished, due to driver @@ -616,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) * operation, when completing napi->poll call. */ bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -635,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, return 0; } -void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - - __set_bit(bit, bitmap); -} - void __cpu_map_flush(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); - unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); - u32 bit; - - /* The napi->poll softirq makes sure __cpu_map_insert_ctx() - * and __cpu_map_flush() happen on same CPU. Thus, the percpu - * bitmap indicate which percpu bulkq have packets. - */ - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!rcpu)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); + struct xdp_bulk_queue *bq, *tmp; - /* Flush all frames in bulkq to real queue */ - bq = this_cpu_ptr(rcpu->bulkq); - bq_flush_to_queue(rcpu, bq, true); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { + bq_flush_to_queue(bq, true); /* If already running, costs spin_lock_irqsave + smb_mb */ - wake_up_process(rcpu->kthread); + wake_up_process(bq->obj->kthread); } } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 191b79948424..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ /* Devmaps primary use is as a backend map for XDP BPF helper call @@ -25,9 +17,8 @@ * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using - * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed - * until all bits are cleared indicating outstanding flush operations have - * completed. + * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until + * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. 
As noted above the xchg() operation also keep the @@ -56,9 +47,13 @@ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) #define DEV_MAP_BULK_SIZE 16 +struct bpf_dtab_netdev; + struct xdp_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; + struct list_head flush_node; struct net_device *dev_rx; + struct bpf_dtab_netdev *obj; unsigned int count; }; @@ -73,22 +68,17 @@ struct bpf_dtab_netdev { struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; - unsigned long __percpu *flush_needed; + struct list_head __percpu *flush_list; struct list_head list; }; static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); -static u64 dev_map_bitmap_size(const union bpf_attr *attr) -{ - return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); -} - static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; - int err = -EINVAL; + int err, cpu; u64 cost; if (!capable(CAP_NET_ADMIN)) @@ -99,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); + /* Lookup returns a pointer straight to dev->ifindex, so make sure the + * verifier prevents writes from the BPF side + */ + attr->map_flags |= BPF_F_RDONLY_PROG; + dtab = kzalloc(sizeof(*dtab), GFP_USER); if (!dtab) return ERR_PTR(-ENOMEM); @@ -107,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); - cost += dev_map_bitmap_size(attr) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_dtab; + cost += sizeof(struct list_head) * num_possible_cpus(); - dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(dtab->map.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&dtab->map.memory, cost); if (err) goto free_dtab; err = -ENOMEM; - /* A per cpu bitfield with a bit per possible net device */ - dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), - __alignof__(unsigned long), - GFP_KERNEL | __GFP_NOWARN); - if (!dtab->flush_needed) - goto free_dtab; + dtab->flush_list = alloc_percpu(struct list_head); + if (!dtab->flush_list) + goto free_charge; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_dtab; + goto free_percpu; spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; + +free_percpu: + free_percpu(dtab->flush_list); +free_charge: + bpf_map_charge_finish(&dtab->map.memory); free_dtab: - free_percpu(dtab->flush_needed); kfree(dtab); return ERR_PTR(err); } @@ -164,15 +159,18 @@ static void dev_map_free(struct bpf_map *map) bpf_clear_redirect_map(map); synchronize_rcu(); + /* Make sure prior __dev_map_entry_free() have completed. */ + rcu_barrier(); + /* To ensure all pending flush operations have completed wait for flush - * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. + * list to empty on _all_ cpus. * Because the above synchronize_rcu() ensures the map is disconnected - * from the program we can assume no new bits will be set. + * from the program we can assume no new items will be added. 
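Userspace keeps populating a devmap with ifindexes exactly as before; the new BPF_F_RDONLY_PROG flag only bars writes from the BPF side, since a lookup hands the program a pointer straight to dev->ifindex. A userspace sketch using libbpf's low-level API (the map fd and interface name are placeholders):

#include <net/if.h>		/* if_nametoindex() */
#include <bpf/bpf.h>		/* bpf_map_update_elem() */

static int devmap_set_port(int map_fd, __u32 slot, const char *ifname)
{
	__u32 ifindex = if_nametoindex(ifname);

	if (!ifindex)
		return -1;
	/* The value is the ifindex; programs may only read it back. */
	return bpf_map_update_elem(map_fd, &slot, &ifindex, 0);
}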
*/ for_each_online_cpu(cpu) { - unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); + struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); - while (!bitmap_empty(bitmap, dtab->map.max_entries)) + while (!list_empty(flush_list)) cond_resched(); } @@ -183,11 +181,12 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + free_percpu(dev->bulkq); dev_put(dev->dev); kfree(dev); } - free_percpu(dtab->flush_needed); + free_percpu(dtab->flush_list); bpf_map_area_free(dtab->netdev_map); kfree(dtab); } @@ -209,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) -{ - struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - - __set_bit(bit, bitmap); -} - -static int bq_xmit_all(struct bpf_dtab_netdev *obj, - struct xdp_bulk_queue *bq, u32 flags, +static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, bool in_napi_ctx) { + struct bpf_dtab_netdev *obj = bq->obj; struct net_device *dev = obj->dev; int sent = 0, drops = 0, err = 0; int i; @@ -247,6 +238,7 @@ out: trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, sent, drops, bq->dev_rx, dev, err); bq->dev_rx = NULL; + __list_del_clearprev(&bq->flush_node); return 0; error: /* If ndo_xdp_xmit fails with an errno, no frames have been @@ -269,30 +261,19 @@ error: * from the driver before returning from its napi->poll() routine. The poll() * routine is called either from busy_poll context or net_rx_action signaled * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the - * net device can be torn down. On devmap tear down we ensure the ctx bitmap - * is zeroed before completing to ensure all flush operations have completed. + * net device can be torn down. On devmap tear down we ensure the flush list + * is empty before completing to ensure all flush operations have completed. */ void __dev_map_flush(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); - u32 bit; - - for_each_set_bit(bit, bitmap, map->max_entries) { - struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); - struct xdp_bulk_queue *bq; - - /* This is possible if the dev entry is removed by user space - * between xdp redirect and flush op. 
- */ - if (unlikely(!dev)) - continue; - - __clear_bit(bit, bitmap); + struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); + struct xdp_bulk_queue *bq, *tmp; - bq = this_cpu_ptr(dev->bulkq); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); - } + rcu_read_lock(); + list_for_each_entry_safe(bq, tmp, flush_list, flush_node) + bq_xmit_all(bq, XDP_XMIT_FLUSH, true); + rcu_read_unlock(); } /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or @@ -318,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, struct net_device *dev_rx) { + struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) - bq_xmit_all(obj, bq, 0, true); + bq_xmit_all(bq, 0, true); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed @@ -331,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, bq->dev_rx = dev_rx; bq->q[bq->count++] = xdpf; + + if (!bq->flush_node.prev) + list_add(&bq->flush_node, flush_list); + return 0; } @@ -381,17 +367,14 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) { if (dev->dev->netdev_ops->ndo_xdp_xmit) { struct xdp_bulk_queue *bq; - unsigned long *bitmap; - int cpu; + rcu_read_lock(); for_each_online_cpu(cpu) { - bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); - __clear_bit(dev->bit, bitmap); - bq = per_cpu_ptr(dev->bulkq, cpu); - bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); + bq_xmit_all(bq, XDP_XMIT_FLUSH, false); } + rcu_read_unlock(); } } @@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, struct net *net = current->nsproxy->net_ns; gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; struct bpf_dtab_netdev *dev, *old_dev; - u32 i = *(u32 *)key; u32 ifindex = *(u32 *)value; + struct xdp_bulk_queue *bq; + u32 i = *(u32 *)key; + int cpu; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; @@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; } + for_each_possible_cpu(cpu) { + bq = per_cpu_ptr(dev->bulkq, cpu); + bq->obj = dev; + } + dev->dev = dev_get_by_index(net, ifindex); if (!dev->dev) { free_percpu(dev->bulkq); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d6b76377cb6e..b44d8c447afd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
*/ #include <linux/bpf.h> @@ -67,7 +59,7 @@ const char *const bpf_class_string[8] = { [BPF_STX] = "stx", [BPF_ALU] = "alu", [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", + [BPF_JMP32] = "jmp32", [BPF_ALU64] = "alu64", }; @@ -136,23 +128,22 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, else print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', + insn->dst_reg, class == BPF_ALU ? 'w' : 'r', insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", + class == BPF_ALU ? 'w' : 'r', insn->src_reg); } else { - verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", + verbose(cbs->private_data, "(%02x) %c%d %s %d\n", + insn->code, class == BPF_ALU ? 'w' : 'r', insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", insn->imm); } } else if (class == BPF_STX) { @@ -206,10 +197,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, * part of the ldimm64 insn is accessible. */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || + insn->src_reg == BPF_PSEUDO_MAP_VALUE; char tmp[64]; - if (map_ptr && !allow_ptr_leaks) + if (is_ptr && !allow_ptr_leaks) imm = 0; verbose(cbs->private_data, "(%02x) r%d = %s\n", @@ -220,7 +212,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP32 || class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { @@ -244,13 +236,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } else if (insn->code == (BPF_JMP | BPF_EXIT)) { verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s %c%d goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], + class == BPF_JMP32 ? 'w' : 'r', insn->src_reg, insn->off); } else { - verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, + verbose(cbs->private_data, + "(%02x) if %c%d %s 0x%x goto pc%+d\n", + insn->code, class == BPF_JMP32 ? 'w' : 'r', + insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e1324a834a24..e546b18d27da 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,14 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #ifndef __BPF_DISASM_H__ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f9274114c88d..22066a62c8c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,14 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. */ #include <linux/bpf.h> #include <linux/btf.h> @@ -23,7 +15,7 @@ #define HTAB_CREATE_FLAG_MASK \ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED) + BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) struct bucket { struct hlist_nulls_head head; @@ -262,8 +254,8 @@ static int htab_map_alloc_check(union bpf_attr *attr) /* Guard against local DoS, and discourage production use. */ return -EPERM; - if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (!lru && percpu_lru) @@ -360,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else cost += (u64) htab->elem_size * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - /* make sure page count doesn't overflow */ - goto free_htab; - - htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - /* if map size is larger than memlock limit, reject it early */ - err = bpf_map_precharge_memlock(htab->map.pages); + /* if map size is larger than memlock limit, reject it */ + err = bpf_map_charge_init(&htab->map.memory, cost); if (err) goto free_htab; @@ -376,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_htab; + goto free_charge; if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; @@ -409,6 +395,8 @@ free_prealloc: prealloc_destroy(htab); free_buckets: bpf_map_area_free(htab->buckets); +free_charge: + bpf_map_charge_finish(&htab->map.memory); free_htab: kfree(htab); return ERR_PTR(err); @@ -527,18 +515,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) return insn - insn_buf; } -static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, + void *key, const bool mark) { struct htab_elem *l = __htab_map_lookup_elem(map, key); if (l) { - bpf_lru_node_set_ref(&l->lru_node); + if (mark) + bpf_lru_node_set_ref(&l->lru_node); return l->key + round_up(map->key_size, 8); } return NULL; } +static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, true); +} + +static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +{ + return __htab_lru_map_lookup_elem(map, key, false); +} + static u32 htab_lru_map_gen_lookup(struct bpf_map 
*map, struct bpf_insn *insn_buf) { @@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) BITS_PER_LONG == 64; } -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) -{ - u32 size = htab->map.value_size; - - if (percpu || fd_htab_map_needs_adjust(htab)) - size = round_up(size, 8); - return size; -} - static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab_size_value(htab, percpu); + u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } + check_and_init_map_lock(&htab->map, + l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); if (percpu) { + size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); - } else { + } else if (fd_htab_map_needs_adjust(htab)) { + size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); + } else { + copy_map_value(&htab->map, + l_new->key + round_up(key_size, 8), + value); } l_new->hash = hash; @@ -805,11 +804,11 @@ dec_count: static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (l_old && map_flags == BPF_NOEXIST) + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; - if (!l_old && map_flags == BPF_EXIST) + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; @@ -828,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -841,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, b = __select_bucket(htab, hash); head = &b->head; + if (unlikely(map_flags & BPF_F_LOCK)) { + if (unlikely(!map_value_has_spin_lock(map))) + return -EINVAL; + /* find an element without taking the bucket lock */ + l_old = lookup_nulls_elem_raw(head, hash, key, key_size, + htab->n_buckets); + ret = check_flags(htab, l_old, map_flags); + if (ret) + return ret; + if (l_old) { + /* grab the element lock and update value in place */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + return 0; + } + /* fall through, grab the bucket lock and lookup again. + * 99.9% chance that the element won't be found, + * but second lookup under lock has to be done. + */ + } + /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); @@ -850,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { + /* first lookup without the bucket lock didn't find the element, + * but second lookup with the bucket lock found it. 
+ * This case is highly unlikely, but has to be dealt with: + * grab the element lock in addition to the bucket lock + * and update element in place + */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + ret = 0; + goto err; + } + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { @@ -1215,6 +1250,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_lru_map_lookup_elem, + .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, .map_gen_lookup = htab_lru_map_gen_lookup, @@ -1246,7 +1282,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) { - struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l; void __percpu *pptr; int ret = -ENOENT; @@ -1262,8 +1297,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) l = __htab_map_lookup_elem(map, key); if (!l) goto out; - if (htab_is_lru(htab)) - bpf_lru_node_set_ref(&l->lru_node); + /* We do not mark LRU map element here in order to not mess up + * eviction heuristics when user space does a map walk. + */ pptr = htab_elem_get_ptr(l, map->key_size); for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a74972b07e74..5e28718928ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
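The BPF_F_LOCK update path above pairs with the bpf_spin_lock()/bpf_spin_unlock() helpers defined in the following hunks: the program serializes on a struct bpf_spin_lock embedded in the map value, and userspace can read or update the same value atomically by passing BPF_F_LOCK to bpf_map_lookup_elem_flags()/bpf_map_update_elem(). A BPF-side sketch (map, struct and section names are illustrative; BPF_ANNOTATE_KV_PAIR from the selftests' bpf_helpers.h supplies the BTF the verifier needs to locate the lock):

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct counter_val {
	struct bpf_spin_lock lock;
	__u64 packets;
	__u64 bytes;
};

struct bpf_map_def SEC("maps") counters = {
	.type		= BPF_MAP_TYPE_HASH,
	.key_size	= sizeof(__u32),
	.value_size	= sizeof(struct counter_val),
	.max_entries	= 128,
};
BPF_ANNOTATE_KV_PAIR(counters, __u32, struct counter_val);

SEC("cgroup_skb/ingress")
int count_packets(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct counter_val *val;

	val = bpf_map_lookup_elem(&counters, &key);
	if (!val)
		return 1;

	bpf_spin_lock(&val->lock);
	val->packets++;
	val->bytes += skb->len;
	bpf_spin_unlock(&val->lock);

	return 1;	/* cgroup_skb: allow */
}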
*/ #include <linux/bpf.h> #include <linux/rcupdate.h> @@ -18,6 +10,9 @@ #include <linux/sched.h> #include <linux/uidgid.h> #include <linux/filter.h> +#include <linux/ctype.h> + +#include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -221,6 +216,102 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg2_type = ARG_CONST_SIZE, }; +#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + union { + __u32 val; + arch_spinlock_t lock; + } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; + + compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); + BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); + BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + arch_spin_lock(l); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + + arch_spin_unlock(l); +} + +#else + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); + do { + atomic_cond_read_relaxed(l, !VAL); + } while (atomic_xchg(l, 1)); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + atomic_set_release(l, 0); +} + +#endif + +static DEFINE_PER_CPU(unsigned long, irqsave_flags); + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + local_irq_save(flags); + __bpf_spin_lock(lock); + __this_cpu_write(irqsave_flags, flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_lock_proto = { + .func = bpf_spin_lock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + flags = __this_cpu_read(irqsave_flags); + __bpf_spin_unlock(lock); + local_irq_restore(flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_unlock_proto = { + .func = bpf_spin_unlock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src) +{ + struct bpf_spin_lock *lock; + + if (lock_src) + lock = src + map->spin_lock_off; + else + lock = dst + map->spin_lock_off; + preempt_disable(); + ____bpf_spin_lock(lock); + copy_map_value(map, dst, src); + ____bpf_spin_unlock(lock); + preempt_enable(); +} + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { @@ -267,4 +358,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = { .arg2_type = ARG_ANYTHING, }; #endif + +#define BPF_STRTOX_BASE_MASK 0x1F + +static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, + unsigned long long *res, bool *is_negative) +{ + unsigned int base = flags & BPF_STRTOX_BASE_MASK; + const char *cur_buf = buf; + size_t cur_len = buf_len; + unsigned int consumed; + size_t val_len; + char str[64]; + + if (!buf || !buf_len || !res || !is_negative) + return -EINVAL; + + if (base != 0 && base != 8 && base != 10 && base != 16) + return -EINVAL; + + if (flags & ~BPF_STRTOX_BASE_MASK) + return -EINVAL; + + while (cur_buf < buf + buf_len && isspace(*cur_buf)) + ++cur_buf; + + *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); + if (*is_negative) + ++cur_buf; + + consumed = cur_buf - buf; + cur_len -= consumed; + if (!cur_len) + 
return -EINVAL; + + cur_len = min(cur_len, sizeof(str) - 1); + memcpy(str, cur_buf, cur_len); + str[cur_len] = '\0'; + cur_buf = str; + + cur_buf = _parse_integer_fixup_radix(cur_buf, &base); + val_len = _parse_integer(cur_buf, base, res); + + if (val_len & KSTRTOX_OVERFLOW) + return -ERANGE; + + if (val_len == 0) + return -EINVAL; + + cur_buf += val_len; + consumed += cur_buf - str; + + return consumed; +} + +static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, + long long *res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) { + if ((long long)-_res > 0) + return -ERANGE; + *res = -_res; + } else { + if ((long long)_res < 0) + return -ERANGE; + *res = _res; + } + return err; +} + +BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, + long *, res) +{ + long long _res; + int err; + + err = __bpf_strtoll(buf, buf_len, flags, &_res); + if (err < 0) + return err; + if (_res != (long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtol_proto = { + .func = bpf_strtol, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, + unsigned long *, res) +{ + unsigned long long _res; + bool is_negative; + int err; + + err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); + if (err < 0) + return err; + if (is_negative) + return -EINVAL; + if (_res != (unsigned long)_res) + return -ERANGE; + *res = _res; + return err; +} + +const struct bpf_func_proto bpf_strtoul_proto = { + .func = bpf_strtoul, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_LONG, +}; #endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 2ada5e21dfa6..cc0d0cf114e3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. @@ -5,10 +6,6 @@ * Authors: * * Daniel Borkmann <daniel@iogearbox.net> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. */ #include <linux/init.h> @@ -518,7 +515,7 @@ out: static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; - int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + int ret = inode_permission(inode, MAY_READ); if (ret) return ERR_PTR(ret); @@ -554,19 +551,6 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ } EXPORT_SYMBOL(bpf_prog_get_type_path); -static void bpf_evict_inode(struct inode *inode) -{ - enum bpf_type type; - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); - if (!bpf_inode_type(inode, &type)) - bpf_any_put(inode->i_private, type); -} - /* * Display the mount options in /proc/mounts. 
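The bpf_strtol()/bpf_strtoul() helpers added to helpers.c above are aimed at programs that parse user-supplied strings, e.g. the cgroup sysctl hooks that appear later in this diff. A hedged BPF-side sketch; the header paths, section name and the 4096 limit are assumptions, not part of the patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/sysctl")
int sanitize_sysctl(struct bpf_sysctl *ctx)
{
	char buf[16] = {};
	long val = 0;

	if (bpf_sysctl_get_new_value(ctx, buf, sizeof(buf)) < 0)
		return 1;			/* not a write: allow */
	/* low 5 bits of flags select the base; 0 means auto-detect */
	if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
		return 0;			/* unparsable: reject the write */
	return val <= 4096;			/* 1 = allow, 0 = reject */
}

char _license[] SEC("license") = "GPL";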
*/ @@ -579,11 +563,22 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void bpf_free_inode(struct inode *inode) +{ + enum bpf_type type; + + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); + free_inode_nonrcu(inode); +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .evict_inode = bpf_evict_inode, + .free_inode = bpf_free_inode, }; enum { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 07a34ef562a0..addd6fdceec8 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO #ifdef CONFIG_CGROUP_BPF #define LOCAL_STORAGE_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_cgroup_storage_map { struct bpf_map map; @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags != BPF_ANY && flags != BPF_EXIST) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) + return -EINVAL; + + if (unlikely(flags & BPF_NOEXIST)) + return -EINVAL; + + if (unlikely((flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, if (!storage) return -ENOENT; + if (flags & BPF_F_LOCK) { + copy_map_value_locked(map, storage->buf->data, value, false); + return 0; + } + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + map->value_size, __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, @@ -147,6 +159,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return -ENOMEM; memcpy(&new->data[0], value, map->value_size); + check_and_init_map_lock(map, new->data); new = xchg(&storage->buf, new); kfree_rcu(new, rcu); @@ -259,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; + struct bpf_map_memory mem; + int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) return ERR_PTR(-EINVAL); @@ -269,21 +284,26 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) if (attr->value_size > PAGE_SIZE) return ERR_PTR(-E2BIG); - if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK) - /* reserved bits should not be used */ + if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return ERR_PTR(-EINVAL); if (attr->max_entries) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); + ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); + if (ret < 0) + return ERR_PTR(ret); + map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER, numa_node); - if (!map) + if (!map) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } - map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), - PAGE_SIZE) >> PAGE_SHIFT; + bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); @@ -483,6 +503,7 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, storage->buf = 
kmalloc_node(size, flags, map->numa_node); if (!storage->buf) goto enomem; + check_and_init_map_lock(map, storage->buf->data); } else { storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); if (!storage->percpu_buf) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index abf1002080df..56e6c75d354d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Longest prefix match list implementation * * Copyright (c) 2016,2017 Daniel Mack * Copyright (c) 2016 David Herrmann - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include <linux/bpf.h> @@ -471,6 +468,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; @@ -537,7 +535,7 @@ out: #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ - BPF_F_RDONLY | BPF_F_WRONLY) + BPF_F_ACCESS_MASK) static struct bpf_map *trie_alloc(union bpf_attr *attr) { @@ -552,6 +550,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags) || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || @@ -571,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; cost += (u64) attr->max_entries * cost_per_node; - if (cost >= U32_MAX - PAGE_SIZE) { - ret = -E2BIG; - goto out_err; - } - - trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(trie->map.pages); + ret = bpf_map_charge_init(&trie->map.memory, cost); if (ret) goto out_err; @@ -714,9 +707,14 @@ find_leftmost: * have exact two children, so this function will never return NULL. */ for (node = search_root; node;) { - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + if (node->flags & LPM_TREE_NODE_FLAG_IM) { + node = rcu_dereference(node->child[0]); + } else { next_node = node; - node = rcu_dereference(node->child[0]); + node = rcu_dereference(node->child[0]); + if (!node) + node = rcu_dereference(next_node->child[1]); + } } do_copy: next_key->prefixlen = next_node->prefixlen; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 52378d3e34b3..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/slab.h> #include <linux/bpf.h> @@ -37,6 +34,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } + if (map_value_has_spin_lock(inner_map)) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. 
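Taken together, check_and_init_map_lock() on the allocation paths and the map-in-map rejection above are what make a BTF-described struct bpf_spin_lock inside a map value usable from programs. A hedged BPF-program sketch; the map name, section name and the BTF-defined map syntax follow libbpf conventions and are assumptions:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct acct {
	struct bpf_spin_lock lock;
	__u64 bytes;
	__u64 pkts;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct acct);
} acct_map SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct acct *a;

	a = bpf_map_lookup_elem(&acct_map, &key);
	if (!a)
		return 1;
	/* read-modify-write of the whole value under the element lock */
	bpf_spin_lock(&a->lock);
	a->bytes += skb->len;
	a->pkts++;
	bpf_spin_unlock(&a->lock);
	return 1;		/* allow the packet */
}

char _license[] SEC("license") = "GPL";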
*/ if (inner_map->ops == &array_map_ops) @@ -53,6 +55,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; + inner_map_meta->spin_lock_off = inner_map->spin_lock_off; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __MAP_IN_MAP_H__ #define __MAP_IN_MAP_H__ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 54cf2b9c44a4..ba635209ae9a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -35,6 +35,7 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; + void *priv; }; struct bpf_offload_netdev { @@ -173,6 +174,41 @@ int bpf_prog_offload_finalize(struct bpf_verifier_env *env) return ret; } +void +bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, + struct bpf_insn *insn) +{ + const struct bpf_prog_offload_ops *ops; + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + ops = offload->offdev->ops; + if (!offload->opt_failed && ops->replace_insn) + ret = ops->replace_insn(env, off, insn); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + +void +bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_prog_offload *offload; + int ret = -EOPNOTSUPP; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) { + if (!offload->opt_failed && offload->offdev->ops->remove_insns) + ret = offload->offdev->ops->remove_insns(env, off, cnt); + offload->opt_failed |= ret; + } + up_read(&bpf_devs_lock); +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_prog_offload *offload = prog->aux->offload; @@ -634,7 +670,7 @@ unlock: EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; int err; @@ -653,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) return ERR_PTR(-ENOMEM); offdev->ops = ops; + offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; @@ -665,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); + +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) +{ + return offdev->priv; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU 
General Public - * License as published by the Free Software Foundation. */ #include "percpu_freelist.h" diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -1,8 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #ifndef __PERCPU_FREELIST_H__ #define __PERCPU_FREELIST_H__ diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index b384ea9f3254..f697647ceb54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -11,8 +11,7 @@ #include "percpu_freelist.h" #define QUEUE_STACK_CREATE_FLAG_MASK \ - (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) - + (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) struct bpf_queue_stack { struct bpf_map map; @@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 0 || attr->value_size == 0 || - attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || + !bpf_map_flags_access_ok(attr->map_flags)) return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) @@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem = {0}; struct bpf_queue_stack *qs; u64 size, queue_size, cost; size = (u64) attr->max_entries + 1; cost = queue_size = sizeof(*qs) + size * attr->value_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); - - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - ret = bpf_map_precharge_memlock(cost); + ret = bpf_map_charge_init(&mem, cost); if (ret < 0) return ERR_PTR(ret); qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) + if (!qs) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - qs->map.pages = cost; + bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 18e225de80ff..50c083ba978c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int err, numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - u64 cost, array_size; + struct bpf_map_memory mem; + u64 array_size; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-ENOMEM); - cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(cost); + err = bpf_map_charge_init(&mem, array_size); if (err) return ERR_PTR(err); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) + if (!array) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); + } /* 
copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - array->map.pages = cost; + bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d43b14535827..052580c33d26 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2016 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. */ #include <linux/bpf.h> #include <linux/jhash.h> @@ -44,7 +41,7 @@ static void do_up_read(struct irq_work *entry) struct stack_map_irq_work *work; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read(work->sem); + up_read_non_owner(work->sem); work->sem = NULL; } @@ -89,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; + struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -116,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) n_buckets = roundup_pow_of_two(attr->max_entries); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - if (cost >= U32_MAX - PAGE_SIZE) - return ERR_PTR(-E2BIG); + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + err = bpf_map_charge_init(&mem, cost); + if (err) + return ERR_PTR(err); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) + if (!smap) { + bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); - - err = -E2BIG; - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_smap; + } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; smap->n_buckets = n_buckets; - smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - - err = bpf_map_precharge_memlock(smap->map.pages); - if (err) - goto free_smap; err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_smap; + goto free_charge; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; + bpf_map_charge_move(&smap->map.memory, &mem); + return &smap->map; put_buffers: put_callchain_buffers(); -free_smap: +free_charge: + bpf_map_charge_finish(&mem); bpf_map_area_free(smap); return ERR_PTR(err); } @@ -338,6 +333,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } else { work->sem = ¤t->mm->mmap_sem; irq_work_queue(&work->irq_work); + /* + * The irq_work will release the mmap_sem with + * up_read_non_owner(). The rwsem_release() is called + * here to release the lock from lockdep's perspective. + */ + rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_); } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8577bb7f8be6..5d141f16f6fa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,13 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
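The stack map changes above (up_read_non_owner() via irq_work plus the lockdep release) are in service of resolving build IDs for user stacks, which may need mmap_sem. A hedged sketch of the kind of program that exercises that path; the map sizing, 127-frame depth and perf_event section name are assumptions:

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(max_entries, 1024);
	__uint(map_flags, BPF_F_STACK_BUILD_ID);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_stack_build_id) * 127);
} stacks SEC(".maps");

SEC("perf_event")
int sample_stack(struct bpf_perf_event_data *ctx)
{
	/* collecting a user stack with build IDs may walk mmap_sem-protected
	 * VMAs; in NMI context the kernel defers the unlock via irq_work */
	bpf_get_stackid(ctx, &stacks, BPF_F_USER_STACK);
	return 0;
}

char _license[] SEC("license") = "GPL";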
See the GNU - * General Public License for more details. */ #include <linux/bpf.h> #include <linux/bpf_trace.h> @@ -136,21 +128,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) void *bpf_map_area_alloc(size_t size, int numa_node) { - /* We definitely need __GFP_NORETRY, so OOM killer doesn't - * trigger under memory pressure as we really just want to - * fail instead. + /* We really just want to fail instead of triggering OOM killer + * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, + * which is used for lower order allocation requests. + * + * It has been observed that higher order allocation requests done by + * vmalloc with __GFP_NORETRY being set might fail due to not trying + * to reclaim memory from the page cache, thus we set + * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO; + + const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc_node(size, GFP_USER | flags, numa_node); + area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, + numa_node); if (area != NULL) return area; } - return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, - __builtin_return_address(0)); + return __vmalloc_node_flags_caller(size, numa_node, + GFP_KERNEL | __GFP_RETRY_MAYFAIL | + flags, __builtin_return_address(0)); } void bpf_map_area_free(void *area) @@ -158,29 +158,28 @@ void bpf_map_area_free(void *area) kvfree(area); } +static u32 bpf_map_flags_retain_permanent(u32 flags) +{ + /* Some map creation flags are not tied to the map object but + * rather to the map fd instead, so they have no meaning upon + * map object inspection since multiple file descriptors with + * different (access) properties can exist here. Thus, given + * this has zero meaning for the map itself, lets clear these + * from here. 
+ */ + return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +} + void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) { map->map_type = attr->map_type; map->key_size = attr->key_size; map->value_size = attr->value_size; map->max_entries = attr->max_entries; - map->map_flags = attr->map_flags; + map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); map->numa_node = bpf_map_attr_numa_node(attr); } -int bpf_map_precharge_memlock(u32 pages) -{ - struct user_struct *user = get_current_user(); - unsigned long memlock_limit, cur; - - memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - cur = atomic_long_read(&user->locked_vm); - free_uid(user); - if (cur + pages > memlock_limit) - return -EPERM; - return 0; -} - static int bpf_charge_memlock(struct user_struct *user, u32 pages) { unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -194,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages) static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) { - atomic_long_sub(pages, &user->locked_vm); + if (user) + atomic_long_sub(pages, &user->locked_vm); } -static int bpf_map_init_memlock(struct bpf_map *map) +int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) { - struct user_struct *user = get_current_user(); + u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; + struct user_struct *user; int ret; - ret = bpf_charge_memlock(user, map->pages); + if (size >= U32_MAX - PAGE_SIZE) + return -E2BIG; + + user = get_current_user(); + ret = bpf_charge_memlock(user, pages); if (ret) { free_uid(user); return ret; } - map->user = user; - return ret; + + mem->pages = pages; + mem->user = user; + + return 0; } -static void bpf_map_release_memlock(struct bpf_map *map) +void bpf_map_charge_finish(struct bpf_map_memory *mem) { - struct user_struct *user = map->user; - bpf_uncharge_memlock(user, map->pages); - free_uid(user); + bpf_uncharge_memlock(mem->user, mem->pages); + free_uid(mem->user); +} + +void bpf_map_charge_move(struct bpf_map_memory *dst, + struct bpf_map_memory *src) +{ + *dst = *src; + + /* Make sure src will not be used for the redundant uncharging. */ + memset(src, 0, sizeof(struct bpf_map_memory)); } int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) { int ret; - ret = bpf_charge_memlock(map->user, pages); + ret = bpf_charge_memlock(map->memory.user, pages); if (ret) return ret; - map->pages += pages; + map->memory.pages += pages; return ret; } void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) { - bpf_uncharge_memlock(map->user, pages); - map->pages -= pages; + bpf_uncharge_memlock(map->memory.user, pages); + map->memory.pages -= pages; } static int bpf_map_alloc_id(struct bpf_map *map) @@ -283,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct bpf_map_memory mem; - bpf_map_release_memlock(map); + bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); /* implementation dependent freeing */ map->ops->map_free(map); + bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -335,6 +353,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +{ + fmode_t mode = f.file->f_mode; + + /* Our file permissions may have been overridden by global + * map permissions facing syscall side. 
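The bpf_map_charge_init()/bpf_map_charge_move()/bpf_map_charge_finish() trio defined here replaces the old precharge plus map->pages accounting, and the converted map types above all follow the same shape. A condensed restatement of that pattern (example_map and its sizing are illustrative, not a real map type):

struct example_map {
	struct bpf_map map;
	/* implementation-specific members would follow */
};

static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	struct bpf_map_memory mem;
	struct example_map *m;
	int err;

	/* charge the RLIMIT_MEMLOCK budget up front (also caps the size) */
	err = bpf_map_charge_init(&mem, sizeof(*m));
	if (err)
		return ERR_PTR(err);

	m = bpf_map_area_alloc(sizeof(*m), bpf_map_attr_numa_node(attr));
	if (!m) {
		bpf_map_charge_finish(&mem);	/* nothing was created: uncharge */
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&m->map, attr);
	bpf_map_charge_move(&m->map.memory, &mem);	/* the map now owns the charge */
	return &m->map;
}

On teardown, bpf_map_free_deferred() above does the inverse: move the charge out of the map, call ->map_free(), then bpf_map_charge_finish().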
+ */ + if (READ_ONCE(map->frozen)) + mode &= ~FMODE_CAN_WRITE; + return mode; +} + #ifdef CONFIG_PROC_FS static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { @@ -356,14 +386,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "max_entries:\t%u\n" "map_flags:\t%#x\n" "memlock:\t%llu\n" - "map_id:\t%u\n", + "map_id:\t%u\n" + "frozen:\t%u\n", map->map_type, map->key_size, map->value_size, map->max_entries, map->map_flags, - map->pages * 1ULL << PAGE_SHIFT, - map->id); + map->memory.pages * 1ULL << PAGE_SHIFT, + map->id, + READ_ONCE(map->frozen)); if (owner_prog_type) { seq_printf(m, "owner_prog_type:\t%u\n", @@ -440,10 +472,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src) const char *end = src + BPF_OBJ_NAME_LEN; memset(dst, 0, BPF_OBJ_NAME_LEN); - - /* Copy all isalnum() and '_' char */ + /* Copy all isalnum(), '_' and '.' chars. */ while (src < end && *src) { - if (!isalnum(*src) && *src != '_') + if (!isalnum(*src) && + *src != '_' && *src != '.') return -EINVAL; *dst++ = *src++; } @@ -463,21 +495,47 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; u32 key_size, value_size; int ret = 0; - key_type = btf_type_id_size(btf, &btf_key_id, &key_size); - if (!key_type || key_size != map->key_size) - return -EINVAL; + /* Some maps allow key to be unspecified. */ + if (btf_key_id) { + key_type = btf_type_id_size(btf, &btf_key_id, &key_size); + if (!key_type || key_size != map->key_size) + return -EINVAL; + } else { + key_type = btf_type_by_id(btf, 0); + if (!map->ops->map_check_btf) + return -EINVAL; + } value_type = btf_type_id_size(btf, &btf_value_id, &value_size); if (!value_type || value_size != map->value_size) return -EINVAL; + map->spin_lock_off = btf_find_spin_lock(btf, value_type); + + if (map_value_has_spin_lock(map)) { + if (map->map_flags & BPF_F_RDONLY_PROG) + return -EACCES; + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE) + return -ENOTSUPP; + if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > + map->value_size) { + WARN_ONCE(1, + "verifier bug spin_lock_off %d value_size %d\n", + map->spin_lock_off, map->value_size); + return -EFAULT; + } + } + if (map->ops->map_check_btf) ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -489,6 +547,7 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -513,7 +572,7 @@ static int map_create(union bpf_attr *attr) err = bpf_obj_name_cpy(map->name, attr->map_name); if (err) - goto free_map_nouncharge; + goto free_map; atomic_set(&map->refcnt, 1); atomic_set(&map->usercnt, 1); @@ -521,62 +580,60 @@ static int map_create(union bpf_attr *attr) if (attr->btf_key_type_id || attr->btf_value_type_id) { struct btf *btf; - if (!attr->btf_key_type_id || !attr->btf_value_type_id) { + if (!attr->btf_value_type_id) { err = -EINVAL; - goto free_map_nouncharge; + goto free_map; } btf = btf_get_by_fd(attr->btf_fd); if (IS_ERR(btf)) { err = PTR_ERR(btf); - goto free_map_nouncharge; + goto free_map; } err = map_check_btf(map, 
btf, attr->btf_key_type_id, attr->btf_value_type_id); if (err) { btf_put(btf); - goto free_map_nouncharge; + goto free_map; } map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; + } else { + map->spin_lock_off = -EINVAL; } err = security_bpf_map_alloc(map); if (err) - goto free_map_nouncharge; - - err = bpf_map_init_memlock(map); - if (err) - goto free_map_sec; + goto free_map; err = bpf_map_alloc_id(map); if (err) - goto free_map; + goto free_map_sec; err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. - * bpf_map_put() is needed because the above + * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ - bpf_map_put(map); + bpf_map_put_with_uref(map); return err; } return err; -free_map: - bpf_map_release_memlock(map); free_map_sec: security_bpf_map_free(map); -free_map_nouncharge: +free_map: btf_put(map->btf); + bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); + bpf_map_charge_finish(&mem); return err; } @@ -664,7 +721,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { @@ -680,16 +737,24 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_LOCK) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -738,14 +803,23 @@ static int map_lookup_elem(union bpf_attr *attr) err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); + if (map->ops->map_lookup_elem_sys_only) + ptr = map->ops->map_lookup_elem_sys_only(map, key); + else + ptr = map->ops->map_lookup_elem(map, key); if (IS_ERR(ptr)) { err = PTR_ERR(ptr); } else if (!ptr) { err = -ENOENT; } else { err = 0; - memcpy(value, ptr, value_size); + if (attr->flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); } rcu_read_unlock(); } @@ -802,12 +876,17 @@ static int map_update_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -908,8 +987,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -960,8 +1038,7 @@ static int map_get_next_key(union bpf_attr *attr) 
map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_READ)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } @@ -1028,8 +1105,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -1071,6 +1147,36 @@ err_put: return err; } +#define BPF_MAP_FREEZE_LAST_FIELD map_fd + +static int map_freeze(const union bpf_attr *attr) +{ + int err = 0, ufd = attr->map_fd; + struct bpf_map *map; + struct fd f; + + if (CHECK_ATTR(BPF_MAP_FREEZE)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + if (READ_ONCE(map->frozen)) { + err = -EBUSY; + goto err_put; + } + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto err_put; + } + + WRITE_ONCE(map->frozen, true); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -1219,6 +1325,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); @@ -1244,24 +1351,54 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) return 0; } +static void bpf_prog_get_stats(const struct bpf_prog *prog, + struct bpf_prog_stats *stats) +{ + u64 nsecs = 0, cnt = 0; + int cpu; + + for_each_possible_cpu(cpu) { + const struct bpf_prog_stats *st; + unsigned int start; + u64 tnsecs, tcnt; + + st = per_cpu_ptr(prog->aux->stats, cpu); + do { + start = u64_stats_fetch_begin_irq(&st->syncp); + tnsecs = st->nsecs; + tcnt = st->cnt; + } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + nsecs += tnsecs; + cnt += tcnt; + } + stats->nsecs = nsecs; + stats->cnt = cnt; +} + #ifdef CONFIG_PROC_FS static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_prog *prog = filp->private_data; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; + struct bpf_prog_stats stats; + bpf_prog_get_stats(prog, &stats); bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, "prog_type:\t%u\n" "prog_jited:\t%u\n" "prog_tag:\t%s\n" "memlock:\t%llu\n" - "prog_id:\t%u\n", + "prog_id:\t%u\n" + "run_time_ns:\t%llu\n" + "run_cnt:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, - prog->aux->id); + prog->aux->id, + stats.nsecs, + stats.cnt); } #endif @@ -1439,6 +1576,24 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_CGROUP_SKB: + switch (expected_attach_type) { + case BPF_CGROUP_INET_INGRESS: + case BPF_CGROUP_INET_EGRESS: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + switch (expected_attach_type) { + case BPF_CGROUP_SETSOCKOPT: + case BPF_CGROUP_GETSOCKOPT: return 0; default: return -EINVAL; @@ -1462,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if 
(CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; - if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) + if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | + BPF_F_ANY_ALIGNMENT | + BPF_F_TEST_RND_HI32)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -1479,7 +1636,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); - if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS) + if (attr->insn_cnt == 0 || + attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && @@ -1531,7 +1689,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (err < 0) goto free_prog; - prog->aux->load_time = ktime_get_boot_ns(); + prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); if (err) goto free_prog; @@ -1562,6 +1720,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: @@ -1649,12 +1808,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) } raw_tp->btp = btp; - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, - BPF_PROG_TYPE_RAW_TRACEPOINT); + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto out_free_tp; } + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { + err = -EINVAL; + goto out_put_prog; + } err = bpf_probe_register(raw_tp->btp, prog); if (err) @@ -1685,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, switch (prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK: case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + case BPF_PROG_TYPE_CGROUP_SKB: + return prog->enforce_expected_attach_type && + prog->expected_attach_type != attach_type ? 
+ -EINVAL : 0; default: return 0; } @@ -1727,6 +1895,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1748,6 +1918,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_FLOW_DISSECTOR: ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; break; + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1809,6 +1986,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1826,6 +2005,13 @@ static int bpf_prog_detach(const union bpf_attr *attr) return lirc_prog_detach(attr); case BPF_FLOW_DISSECTOR: return skb_flow_dissector_bpf_prog_detach(attr); + case BPF_CGROUP_SYSCTL: + ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; + break; + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: + ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; + break; default: return -EINVAL; } @@ -1857,11 +2043,18 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: + case BPF_CGROUP_SYSCTL: + case BPF_CGROUP_GETSOCKOPT: + case BPF_CGROUP_SETSOCKOPT: break; case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_prog_query(attr, uattr); default: return -EINVAL; } @@ -1869,7 +2062,7 @@ static int bpf_prog_query(const union bpf_attr *attr, return cgroup_bpf_prog_query(attr, uattr); } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration +#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1882,6 +2075,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr, if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; + if ((attr->test.ctx_size_in && !attr->test.ctx_in) || + (!attr->test.ctx_size_in && attr->test.ctx_in)) + return -EINVAL; + + if ((attr->test.ctx_size_out && !attr->test.ctx_out) || + (!attr->test.ctx_size_out && attr->test.ctx_out)) + return -EINVAL; + prog = bpf_prog_get(attr->test.prog_fd); if (IS_ERR(prog)) return PTR_ERR(prog); @@ -1986,19 +2187,32 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) fd = bpf_map_new_fd(map, f_flags); if (fd < 0) - bpf_map_put(map); + bpf_map_put_with_uref(map); return fd; } static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, - unsigned long addr) + unsigned long addr, u32 *off, + u32 *type) { + const struct bpf_map *map; int i; - for (i = 0; i < prog->aux->used_map_cnt; i++) - if (prog->aux->used_maps[i] == (void *)addr) - return prog->aux->used_maps[i]; + for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { + map = prog->aux->used_maps[i]; + if (map == (void *)addr) { + *type = BPF_PSEUDO_MAP_FD; + return map; + } + if (!map->ops->map_direct_value_meta) + continue; + if (!map->ops->map_direct_value_meta(map, addr, off)) { + *type = BPF_PSEUDO_MAP_VALUE; + return map; + } + } + 
return NULL; } @@ -2006,6 +2220,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) { const struct bpf_map *map; struct bpf_insn *insns; + u32 off, type; u64 imm; int i; @@ -2033,11 +2248,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) continue; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; - map = bpf_map_from_imm(prog, imm); + map = bpf_map_from_imm(prog, imm, &off, &type); if (map) { - insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].src_reg = type; insns[i].imm = map->id; - insns[i + 1].imm = 0; + insns[i + 1].imm = off; continue; } } @@ -2083,6 +2298,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2122,6 +2338,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; @@ -2622,6 +2842,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_MAP_GET_NEXT_KEY: err = map_get_next_key(&attr); break; + case BPF_MAP_FREEZE: + err = map_freeze(&attr); + break; case BPF_PROG_LOAD: err = bpf_prog_load(&attr, uattr); break; diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..ca52b9642943 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* tnum: tracked (or tristate) numbers * * A tnum tracks knowledge about the bits of a value. Each bit can be either diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56674a7c3778..a2e763703c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
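With BPF_MAP_FREEZE wired into the syscall switch above, user space can make a map immutable from the syscall side once it is populated. A hedged userspace sketch, assuming libbpf's bpf_map_freeze() wrapper (publish_config, map_fd and the value layout are illustrative):

#include <errno.h>
#include <bpf/bpf.h>

static int publish_config(int map_fd, __u32 key, __u64 setting)
{
	if (bpf_map_update_elem(map_fd, &key, &setting, BPF_ANY))
		return -errno;
	/* needs CAP_SYS_ADMIN; returns -EBUSY if already frozen */
	if (bpf_map_freeze(map_fd))
		return -errno;
	/* from here on, syscall-side writes to this map fail with -EPERM,
	 * while programs keep reading it as before */
	return 0;
}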
*/ #include <uapi/linux/btf.h> #include <linux/kernel.h> @@ -176,8 +168,7 @@ struct bpf_verifier_stack_elem { struct bpf_verifier_stack_elem *next; }; -#define BPF_COMPLEXITY_LIMIT_INSNS 131072 -#define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 #define BPF_COMPLEXITY_LIMIT_STATES 64 #define BPF_MAP_PTR_UNPRIV 1UL @@ -212,7 +203,8 @@ struct bpf_call_arg_meta { int access_size; s64 msize_smax_value; u64 msize_umax_value; - int ptr_id; + int ref_obj_id; + int func_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -330,35 +322,39 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL; -} - -static bool type_is_refcounted(enum bpf_reg_type type) +static bool type_is_sk_pointer(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET; + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; } -static bool type_is_refcounted_or_null(enum bpf_reg_type type) +static bool reg_type_may_be_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || type == PTR_TO_SOCKET_OR_NULL; + return type == PTR_TO_MAP_VALUE_OR_NULL || + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } -static bool reg_is_refcounted(const struct bpf_reg_state *reg) +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { - return type_is_refcounted(reg->type); + return reg->type == PTR_TO_MAP_VALUE && + map_value_has_spin_lock(reg->map_ptr); } -static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) +static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type_is_refcounted_or_null(reg->type); + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_TCP_SOCK_OR_NULL; } -static bool arg_type_is_refcounted(enum bpf_arg_type type) +static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { - return type == ARG_PTR_TO_SOCKET; + return type == ARG_PTR_TO_SOCK_COMMON; } /* Determine whether the function releases some resources allocated by another @@ -370,6 +366,19 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } +static bool is_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp; +} + +static bool is_ptr_cast_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_tcp_sock || + func_id == BPF_FUNC_sk_fullsock; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -385,6 +394,12 @@ static const char * const reg_type_str[] = { [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", }; static char slot_type_char[] = { @@ -432,14 +447,16 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); 
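The ref_obj_id plumbing and the acquire/release classification above are what force programs to balance socket references. A hedged BPF-side sketch; the section name and the empty tuple are illustrative, a real program would fill the tuple from the packet headers:

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("classifier")
int track_ref(struct __sk_buff *skb)
{
	struct bpf_sock_tuple tuple = {};
	struct bpf_sock *sk;

	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
			       BPF_F_CURRENT_NETNS, 0);
	if (sk)
		bpf_sk_release(sk);	/* every acquired ref must be released
					 * on every path, or the verifier
					 * rejects the program */
	return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";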
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); - if (t == PTR_TO_STACK) - verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t)) + verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) @@ -497,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, "=%s", - reg_type_str[state->stack[i].spilled_ptr.type]); - else + if (state->stack[i].slot_type[0] == STACK_SPILL) { + reg = &state->stack[i].spilled_ptr; + t = reg->type; + verbose(env, "=%s", reg_type_str[t]); + if (t == SCALAR_VALUE && reg->precise) + verbose(env, "P"); + if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) + verbose(env, "%lld", reg->var_off.value + reg->off); + } else { verbose(env, "=%s", types_buf); + } } if (state->acquired_refs && state->refs[0].id) { verbose(env, " refs=%d", state->refs[0].id); @@ -611,13 +634,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) } /* release function corresponding to acquire_reference_state(). Idempotent. */ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id) { int i, last_idx; - if (!ptr_id) - return -EFAULT; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { @@ -629,21 +649,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id) return 0; } } - return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. 
- */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ - struct bpf_func_state *state = cur_func(env); - int err; - - err = __release_reference_state(state, ptr_id); - if (WARN_ON_ONCE(err != 0)) - verbose(env, "verifier internal error: can't release reference\n"); - return err; + return -EINVAL; } static int transfer_reference_state(struct bpf_func_state *dst, @@ -667,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -676,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, free_func_state(state->frame[i]); state->frame[i] = NULL; } + clear_jmp_history(state); if (free_self) kfree(state); } @@ -703,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, const struct bpf_verifier_state *src) { struct bpf_func_state *dst; + u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; int i, err; + if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { + kfree(dst_state->jmp_history); + dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + } + memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them */ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { free_func_state(dst_state->frame[i]); @@ -712,6 +736,11 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; + dst_state->active_spin_lock = src->active_spin_lock; + dst_state->branches = src->branches; + dst_state->parent = src->parent; + dst_state->first_insn_idx = src->first_insn_idx; + dst_state->last_insn_idx = src->last_insn_idx; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -727,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, return 0; } +static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + while (st) { + u32 br = --st->branches; + + /* WARN_ON(br > 1) technically makes sense here, + * but see comment in push_stack(), hence: + */ + WARN_ONCE((int)br < 0, + "BUG update_branch_counts:branches_to_explore=%d\n", + br); + if (br) + break; + st = st->parent; + } +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -775,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, if (err) goto err; elem->st.speculative |= speculative; - if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose(env, "BPF program is too complex\n"); + if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { + verbose(env, "The sequence of %d jumps is too complex.\n", + env->stack_size); goto err; } + if (elem->st.parent) { + ++elem->st.parent->branches; + /* WARN_ON(branches > 2) technically makes sense here, + * but + * 1. speculative states will bump 'branches' for non-branch + * instructions + * 2. is_state_visited() heuristics may decide not to create + * a new state for a sequence of branches and all such current + * and cloned states will be pointing to a single parent state + * which might have large 'branches' count. 
+ */ + } return &elem->st; err: free_verifier_state(env->cur_state, true); @@ -926,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) reg->smax_value = S64_MAX; reg->umin_value = 0; reg->umax_value = U64_MAX; + + /* constant backtracking is enabled for root only for now */ + reg->precise = capable(CAP_SYS_ADMIN) ? false : true; } /* Mark a register as having a completely unknown (scalar) value. */ @@ -974,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(regs + regno); } +#define DEF_NOT_SUBREG (0) static void init_reg_state(struct bpf_verifier_env *env, struct bpf_func_state *state) { @@ -984,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env, mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; regs[i].parent = NULL; + regs[i].subreg_def = DEF_NOT_SUBREG; } /* frame pointer */ @@ -1085,7 +1149,7 @@ static int check_subprogs(struct bpf_verifier_env *env) */ subprog[env->subprog_cnt].start = insn_cnt; - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2) for (i = 0; i < env->subprog_cnt; i++) verbose(env, "func#%d @%d\n", i, subprog[i].start); @@ -1095,7 +1159,7 @@ static int check_subprogs(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { u8 code = insn[i].code; - if (BPF_CLASS(code) != BPF_JMP) + if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; @@ -1129,9 +1193,10 @@ next: */ static int mark_reg_read(struct bpf_verifier_env *env, const struct bpf_reg_state *state, - struct bpf_reg_state *parent) + struct bpf_reg_state *parent, u8 flag) { bool writes = parent == state->parent; /* Observe write marks */ + int cnt = 0; while (parent) { /* if read wasn't screened by an earlier write ... */ @@ -1143,50 +1208,625 @@ static int mark_reg_read(struct bpf_verifier_env *env, parent->var_off.value, parent->off); return -EFAULT; } + /* The first condition is more likely to be true than the + * second, checked it first. + */ + if ((parent->live & REG_LIVE_READ) == flag || + parent->live & REG_LIVE_READ64) + /* The parentage chain never changes and + * this parent was already marked as LIVE_READ. + * There is no need to keep walking the chain again and + * keep re-marking all parents as LIVE_READ. + * This case happens when the same register is read + * multiple times without writes into it in-between. + * Also, if parent has the stronger REG_LIVE_READ64 set, + * then no need to set the weak REG_LIVE_READ32. + */ + break; /* ... then we depend on parent's value */ - parent->live |= REG_LIVE_READ; + parent->live |= flag; + /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ + if (flag == REG_LIVE_READ64) + parent->live &= ~REG_LIVE_READ32; state = parent; parent = state->parent; writes = true; + cnt++; } + + if (env->longest_mark_read_walk < cnt) + env->longest_mark_read_walk = cnt; return 0; } +/* This function is supposed to be used by the following 32-bit optimization + * code only. It returns TRUE if the source or destination register operates + * on 64-bit, otherwise return FALSE. + */ +static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, + u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +{ + u8 code, class, op; + + code = insn->code; + class = BPF_CLASS(code); + op = BPF_OP(code); + if (class == BPF_JMP) { + /* BPF_EXIT for "main" will reach here. Return TRUE + * conservatively. 
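The read-mark propagation in mark_reg_read() above walks the parentage chain, stops early once an ancestor already carries an equal or stronger mark, and lets a 64-bit read supersede a 32-bit one. A rough stand-alone sketch of just that walk (flag values and struct layout are illustrative):

#include <assert.h>
#include <stddef.h>

#define LIVE_READ32	0x1
#define LIVE_READ64	0x2
#define LIVE_READ	(LIVE_READ32 | LIVE_READ64)

struct reg_node {
	struct reg_node *parent;
	unsigned int live;
};

/* Propagate a read mark up the parentage chain.  Stop early once an
 * ancestor already carries this exact flag or the stronger 64-bit flag,
 * and let a 64-bit read override a previously recorded 32-bit one.
 * Returns how many ancestors were visited, mirroring the walk-length
 * statistic kept above.
 */
static int mark_read(struct reg_node *state, unsigned int flag)
{
	struct reg_node *parent = state->parent;
	int cnt = 0;

	while (parent) {
		if ((parent->live & LIVE_READ) == flag ||
		    (parent->live & LIVE_READ64))
			break;
		parent->live |= flag;
		if (flag == LIVE_READ64)
			parent->live &= ~LIVE_READ32;
		parent = parent->parent;
		cnt++;
	}
	return cnt;
}

int main(void)
{
	struct reg_node gp = { NULL, 0 };
	struct reg_node p = { &gp, 0 };
	struct reg_node r = { &p, 0 };

	assert(mark_read(&r, LIVE_READ32) == 2);	/* full walk        */
	assert(mark_read(&r, LIVE_READ32) == 0);	/* already marked   */
	assert(mark_read(&r, LIVE_READ64) == 2);	/* upgrade the mark */
	assert(p.live == LIVE_READ64);			/* READ32 cleared   */
	return 0;
}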
+ */ + if (op == BPF_EXIT) + return true; + if (op == BPF_CALL) { + /* BPF to BPF call will reach here because of marking + * caller saved clobber with DST_OP_NO_MARK for which we + * don't care the register def because they are anyway + * marked as NOT_INIT already. + */ + if (insn->src_reg == BPF_PSEUDO_CALL) + return false; + /* Helper call will reach here because of arg type + * check, conservatively return TRUE. + */ + if (t == SRC_OP) + return true; + + return false; + } + } + + if (class == BPF_ALU64 || class == BPF_JMP || + /* BPF_END always use BPF_ALU class. */ + (class == BPF_ALU && op == BPF_END && insn->imm == 64)) + return true; + + if (class == BPF_ALU || class == BPF_JMP32) + return false; + + if (class == BPF_LDX) { + if (t != SRC_OP) + return BPF_SIZE(code) == BPF_DW; + /* LDX source must be ptr. */ + return true; + } + + if (class == BPF_STX) { + if (reg->type != SCALAR_VALUE) + return true; + return BPF_SIZE(code) == BPF_DW; + } + + if (class == BPF_LD) { + u8 mode = BPF_MODE(code); + + /* LD_IMM64 */ + if (mode == BPF_IMM) + return true; + + /* Both LD_IND and LD_ABS return 32-bit data. */ + if (t != SRC_OP) + return false; + + /* Implicit ctx ptr. */ + if (regno == BPF_REG_6) + return true; + + /* Explicit source could be any width. */ + return true; + } + + if (class == BPF_ST) + /* The only source register for BPF_ST is a ptr. */ + return true; + + /* Conservatively return true at default. */ + return true; +} + +/* Return TRUE if INSN doesn't have explicit value define. */ +static bool insn_no_def(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + return (class == BPF_JMP || class == BPF_JMP32 || + class == BPF_STX || class == BPF_ST); +} + +/* Return TRUE if INSN has defined any 32-bit value explicitly. */ +static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + if (insn_no_def(insn)) + return false; + + return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +} + +static void mark_insn_zext(struct bpf_verifier_env *env, + struct bpf_reg_state *reg) +{ + s32 def_idx = reg->subreg_def; + + if (def_idx == DEF_NOT_SUBREG) + return; + + env->insn_aux_data[def_idx - 1].zext_dst = true; + /* The dst will be zero extended, so won't be sub-register anymore. */ + reg->subreg_def = DEF_NOT_SUBREG; +} + static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs; + struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; + struct bpf_reg_state *reg, *regs = state->regs; + bool rw64; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); return -EINVAL; } + reg = ®s[regno]; + rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { /* check whether register used as source operand can be read */ - if (regs[regno].type == NOT_INIT) { + if (reg->type == NOT_INIT) { verbose(env, "R%d !read_ok\n", regno); return -EACCES; } /* We don't need to worry about FP liveness because it's read-only */ - if (regno != BPF_REG_FP) - return mark_reg_read(env, ®s[regno], - regs[regno].parent); + if (regno == BPF_REG_FP) + return 0; + + if (rw64) + mark_insn_zext(env, reg); + + return mark_reg_read(env, reg, reg->parent, + rw64 ? 
REG_LIVE_READ64 : REG_LIVE_READ32); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { verbose(env, "frame pointer is read only\n"); return -EACCES; } - regs[regno].live |= REG_LIVE_WRITTEN; + reg->live |= REG_LIVE_WRITTEN; + reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; if (t == DST_OP) mark_reg_unknown(env, regs, regno); } return 0; } +/* for any branch, call, exit record the history of jmps in the given state */ +static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur) +{ + u32 cnt = cur->jmp_history_cnt; + struct bpf_idx_pair *p; + + cnt++; + p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + if (!p) + return -ENOMEM; + p[cnt - 1].idx = env->insn_idx; + p[cnt - 1].prev_idx = env->prev_insn_idx; + cur->jmp_history = p; + cur->jmp_history_cnt = cnt; + return 0; +} + +/* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. + */ +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) +{ + u32 cnt = *history; + + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; + } else { + i--; + } + return i; +} + +/* For given verifier state backtrack_insn() is called from the last insn to + * the first insn. Its purpose is to compute a bitmask of registers and + * stack slots that needs precision in the parent verifier state. + */ +static int backtrack_insn(struct bpf_verifier_env *env, int idx, + u32 *reg_mask, u64 *stack_mask) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + .private_data = env, + }; + struct bpf_insn *insn = env->prog->insnsi + idx; + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u32 dreg = 1u << insn->dst_reg; + u32 sreg = 1u << insn->src_reg; + u32 spi; + + if (insn->code == 0) + return 0; + if (env->log.level & BPF_LOG_LEVEL) { + verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); + verbose(env, "%d: ", idx); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + if (!(*reg_mask & dreg)) + return 0; + if (opcode == BPF_MOV) { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg = sreg + * dreg needs precision after this insn + * sreg needs precision before this insn + */ + *reg_mask &= ~dreg; + *reg_mask |= sreg; + } else { + /* dreg = K + * dreg needs precision after this insn. + * Corresponding register is already marked + * as precise=true in this verifier state. + * No further markings in parent are necessary + */ + *reg_mask &= ~dreg; + } + } else { + if (BPF_SRC(insn->code) == BPF_X) { + /* dreg += sreg + * both dreg and sreg need precision + * before this insn + */ + *reg_mask |= sreg; + } /* else dreg += K + * dreg still needs precision before this insn + */ + } + } else if (class == BPF_LDX) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + + /* scalars can only be spilled into stack w/o losing precision. + * Load from any other memory can be zero extended. + * The desire to keep that precision is already indicated + * by 'precise' mark in corresponding register of this state. + * No further tracking necessary. + */ + if (insn->src_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + + /* dreg = *(u64 *)[fp - off] was a fill from the stack. 
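push_jmp_history() and get_prev_insn_idx() above together let the precision pass replay a path backwards. A compact model, using a fixed-size array instead of krealloc() purely to keep the sketch short:

#include <assert.h>

/* One recorded jump: the walk arrived at insn 'idx' coming from insn
 * 'prev_idx', i.e. not by falling through from idx - 1.
 */
struct idx_pair { int idx; int prev_idx; };

struct jmp_hist {
	struct idx_pair ent[16];
	unsigned int cnt;
};

static void push_jmp(struct jmp_hist *h, int idx, int prev_idx)
{
	h->ent[h->cnt].idx = idx;
	h->ent[h->cnt].prev_idx = prev_idx;
	h->cnt++;
}

/* Walk one instruction backwards: if the most recent history entry says
 * we jumped into 'i', follow the jump back and pop it; otherwise the
 * previous insn was simply i - 1 (straight-line execution).
 */
static int prev_insn(const struct jmp_hist *h, int i, unsigned int *cnt)
{
	if (*cnt && h->ent[*cnt - 1].idx == i) {
		i = h->ent[*cnt - 1].prev_idx;
		(*cnt)--;
	} else {
		i--;
	}
	return i;
}

int main(void)
{
	struct jmp_hist h = { .cnt = 0 };
	unsigned int cnt;
	int i;

	/* Pretend insn 3 jumped to insn 10, then 10..12 ran linearly. */
	push_jmp(&h, 10, 3);

	cnt = h.cnt;
	i = prev_insn(&h, 12, &cnt);	/* 12 -> 11                     */
	assert(i == 11);
	i = prev_insn(&h, i, &cnt);	/* 11 -> 10                     */
	assert(i == 10);
	i = prev_insn(&h, i, &cnt);	/* 10 -> 3, via the recorded jump */
	assert(i == 3 && cnt == 0);
	return 0;
}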
+ * that [fp - off] slot contains scalar that needs to be + * tracked with precision + */ + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + *stack_mask |= 1ull << spi; + } else if (class == BPF_STX) { + if (*reg_mask & dreg) + /* stx shouldn't be using _scalar_ dst_reg + * to access memory. It means backtracking + * encountered a case of pointer subtraction. + */ + return -ENOTSUPP; + /* scalars can only be spilled into stack */ + if (insn->dst_reg != BPF_REG_FP) + return 0; + if (BPF_SIZE(insn->code) != BPF_DW) + return 0; + spi = (-insn->off - 1) / BPF_REG_SIZE; + if (spi >= 64) { + verbose(env, "BUG spi %d\n", spi); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + if (!(*stack_mask & (1ull << spi))) + return 0; + *stack_mask &= ~(1ull << spi); + *reg_mask |= sreg; + } else if (class == BPF_JMP || class == BPF_JMP32) { + if (opcode == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + return -ENOTSUPP; + /* regular helper call sets R0 */ + *reg_mask &= ~1; + if (*reg_mask & 0x3f) { + /* if backtracing was looking for registers R1-R5 + * they should have been found already. + */ + verbose(env, "BUG regs %x\n", *reg_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } else if (opcode == BPF_EXIT) { + return -ENOTSUPP; + } + } else if (class == BPF_LD) { + if (!(*reg_mask & dreg)) + return 0; + *reg_mask &= ~dreg; + /* It's ld_imm64 or ld_abs or ld_ind. + * For ld_imm64 no further tracking of precision + * into parent is necessary + */ + if (mode == BPF_IND || mode == BPF_ABS) + /* to be analyzed */ + return -ENOTSUPP; + } else if (class == BPF_ST) { + if (*reg_mask & dreg) + /* likely pointer subtraction */ + return -ENOTSUPP; + } + return 0; +} + +/* the scalar precision tracking algorithm: + * . at the start all registers have precise=false. + * . scalar ranges are tracked as normal through alu and jmp insns. + * . once precise value of the scalar register is used in: + * . ptr + scalar alu + * . if (scalar cond K|scalar) + * . helper_call(.., scalar, ...) where ARG_CONST is expected + * backtrack through the verifier states and mark all registers and + * stack slots with spilled constants that these scalar regisers + * should be precise. + * . during state pruning two registers (or spilled stack slots) + * are equivalent if both are not precise. + * + * Note the verifier cannot simply walk register parentage chain, + * since many different registers and stack slots could have been + * used to compute single precise scalar. + * + * The approach of starting with precise=true for all registers and then + * backtrack to mark a register as not precise when the verifier detects + * that program doesn't care about specific value (e.g., when helper + * takes register as ARG_ANYTHING parameter) is not safe. + * + * It's ok to walk single parentage chain of the verifier states. + * It's possible that this backtracking will go all the way till 1st insn. + * All other branches will be explored for needing precision later. + * + * The backtracking needs to deal with cases like: + * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) + * r9 -= r8 + * r5 = r9 + * if r5 > 0x79f goto pc+7 + * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) + * r5 += 1 + * ... 
+ * call bpf_perf_event_output#25 + * where .arg5_type = ARG_CONST_SIZE_OR_ZERO + * + * and this case: + * r6 = 1 + * call foo // uses callee's r6 inside to compute r0 + * r0 += r6 + * if r0 == 0 goto + * + * to track above reg_mask/stack_mask needs to be independent for each frame. + * + * Also if parent's curframe > frame where backtracking started, + * the verifier need to mark registers in both frames, otherwise callees + * may incorrectly prune callers. This is similar to + * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") + * + * For now backtracking falls back into conservative marking. + */ +static void mark_all_scalars_precise(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + /* big hammer: mark all scalars precise in this path. + * pop_stack may still get !precise scalars. + */ + for (; st; st = st->parent) + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (func->stack[j].slot_type[0] != STACK_SPILL) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = true; + } + } +} + +static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, + int spi) +{ + struct bpf_verifier_state *st = env->cur_state; + int first_idx = st->first_insn_idx; + int last_idx = env->insn_idx; + struct bpf_func_state *func; + struct bpf_reg_state *reg; + u32 reg_mask = regno >= 0 ? 1u << regno : 0; + u64 stack_mask = spi >= 0 ? 1ull << spi : 0; + bool skip_first = true; + bool new_marks = false; + int i, err; + + if (!env->allow_ptr_leaks) + /* backtracking is root only for now */ + return 0; + + func = st->frame[st->curframe]; + if (regno >= 0) { + reg = &func->regs[regno]; + if (reg->type != SCALAR_VALUE) { + WARN_ONCE(1, "backtracing misuse"); + return -EFAULT; + } + if (!reg->precise) + new_marks = true; + else + reg_mask = 0; + reg->precise = true; + } + + while (spi >= 0) { + if (func->stack[spi].slot_type[0] != STACK_SPILL) { + stack_mask = 0; + break; + } + reg = &func->stack[spi].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask = 0; + break; + } + if (!reg->precise) + new_marks = true; + else + stack_mask = 0; + reg->precise = true; + break; + } + + if (!new_marks) + return 0; + if (!reg_mask && !stack_mask) + return 0; + for (;;) { + DECLARE_BITMAP(mask, 64); + u32 history = st->jmp_history_cnt; + + if (env->log.level & BPF_LOG_LEVEL) + verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + for (i = last_idx;;) { + if (skip_first) { + err = 0; + skip_first = false; + } else { + err = backtrack_insn(env, i, ®_mask, &stack_mask); + } + if (err == -ENOTSUPP) { + mark_all_scalars_precise(env, st); + return 0; + } else if (err) { + return err; + } + if (!reg_mask && !stack_mask) + /* Found assignment(s) into tracked register in this state. + * Since this state is already marked, just return. + * Nothing to be tracked further in the parent state. + */ + return 0; + if (i == first_idx) + break; + i = get_prev_insn_idx(st, i, &history); + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask + * to backtrack. 
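The reg_mask/stack_mask bookkeeping that backtrack_insn() performs can be illustrated with the MOV/ALU cases alone. The helpers below are simplified stand-ins, processing instructions in reverse program order the way the backtracking loop does:

#include <assert.h>
#include <stdint.h>

/* Register bitmask used while walking instructions backwards: bit r set
 * means "the value currently in r still needs precision before this
 * instruction".
 */
static uint32_t mask_after_mov_reg(uint32_t mask, int dst, int src)
{
	/* dst = src: dst no longer needs tracking in older insns,
	 * src does instead.
	 */
	if (mask & (1u << dst)) {
		mask &= ~(1u << dst);
		mask |= 1u << src;
	}
	return mask;
}

static uint32_t mask_after_mov_imm(uint32_t mask, int dst)
{
	/* dst = K: the constant is known right here, nothing further
	 * to track for dst in older instructions.
	 */
	return mask & ~(1u << dst);
}

static uint32_t mask_after_alu_reg(uint32_t mask, int dst, int src)
{
	/* dst += src: both inputs feed the tracked value. */
	if (mask & (1u << dst))
		mask |= 1u << src;
	return mask;
}

int main(void)
{
	uint32_t mask = 1u << 5;		/* r5 needs precision */

	/* Forward program: r5 = r8; r5 += r9; -- walked in reverse. */
	mask = mask_after_alu_reg(mask, 5, 9);	/* r5 += r9           */
	assert(mask == ((1u << 5) | (1u << 9)));
	mask = mask_after_mov_reg(mask, 5, 8);	/* r5 = r8            */
	assert(mask == ((1u << 8) | (1u << 9)));
	mask = mask_after_mov_imm(mask, 8);	/* r8 = 42            */
	assert(mask == (1u << 9));
	return 0;
}

Walking "r5 = r8; r5 += r9" backwards therefore ends up demanding precision for r8 and r9, which is exactly the information the parent states need.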
+ * It means the backtracking missed the spot where + * particular register was initialized with a constant. + */ + verbose(env, "BUG backtracking idx %d\n", i); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + } + st = st->parent; + if (!st) + break; + + new_marks = false; + func = st->frame[st->curframe]; + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &func->regs[i]; + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); + continue; + } + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + + bitmap_from_u64(mask, stack_mask); + for_each_set_bit(i, mask, 64) { + if (i >= func->allocated_stack / BPF_REG_SIZE) { + /* This can happen if backtracking + * is propagating stack precision where + * caller has larger stack frame + * than callee, but backtrack_insn() should + * have returned -ENOTSUPP. + */ + verbose(env, "BUG spi %d stack_size %d\n", + i, func->allocated_stack); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + + if (func->stack[i].slot_type[0] != STACK_SPILL) { + stack_mask &= ~(1ull << i); + continue; + } + reg = &func->stack[i].spilled_ptr; + if (reg->type != SCALAR_VALUE) { + stack_mask &= ~(1ull << i); + continue; + } + if (!reg->precise) + new_marks = true; + reg->precise = true; + } + if (env->log.level & BPF_LOG_LEVEL) { + print_verifier_state(env, func); + verbose(env, "parent %s regs=%x stack=%llx marks\n", + new_marks ? "didn't have" : "already had", + reg_mask, stack_mask); + } + + if (!reg_mask && !stack_mask) + break; + if (!new_marks) + break; + + last_idx = st->last_insn_idx; + first_idx = st->first_insn_idx; + } + return 0; +} + +static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +{ + return __mark_chain_precision(env, regno, -1); +} + +static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +{ + return __mark_chain_precision(env, -1, spi); +} + static bool is_spillable_regtype(enum bpf_reg_type type) { switch (type) { @@ -1201,6 +1841,11 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return true; default: return false; @@ -1213,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg) return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } +static bool register_is_const(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); +} + +static void save_register_state(struct bpf_func_state *state, + int spi, struct bpf_reg_state *reg) +{ + int i; + + state->stack[spi].spilled_ptr = *reg; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack[spi].slot_type[i] = STACK_SPILL; +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -1222,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - enum bpf_reg_type type; + u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; + struct bpf_reg_state *reg = NULL; err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), state->acquired_refs, true); @@ -1239,27 +1902,48 @@ static int check_stack_write(struct 
bpf_verifier_env *env, } cur = env->cur_state->frame[env->cur_state->curframe]; - if (value_regno >= 0 && - is_spillable_regtype((type = cur->regs[value_regno].type))) { - + if (value_regno >= 0) + reg = &cur->regs[value_regno]; + + if (reg && size == BPF_REG_SIZE && register_is_const(reg) && + !register_is_null(reg) && env->allow_ptr_leaks) { + if (dst_reg != BPF_REG_FP) { + /* The backtracking logic can only recognize explicit + * stack slot address like [fp - 8]. Other spill of + * scalar via different register has to be conervative. + * Backtrack from here and mark all registers as precise + * that contributed into 'reg' being a constant. + */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + save_register_state(state, spi, reg); + } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { + verbose_linfo(env, insn_idx, "; "); verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && type == PTR_TO_STACK) { + if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - /* save register state */ - state->stack[spi].spilled_ptr = cur->regs[value_regno]; - state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + if (!env->allow_ptr_leaks) { + bool sanitize = false; - for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack[spi].slot_type[i] == STACK_MISC && - !env->allow_ptr_leaks) { + if (state->stack[spi].slot_type[0] == STACK_SPILL && + register_is_const(&state->stack[spi].spilled_ptr)) + sanitize = true; + for (i = 0; i < BPF_REG_SIZE; i++) + if (state->stack[spi].slot_type[i] == STACK_MISC) { + sanitize = true; + break; + } + if (sanitize) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -1282,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack[spi].slot_type[i] = STACK_SPILL; } + save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -1306,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (value_regno >= 0 && - register_is_null(&cur->regs[value_regno])) + if (reg && register_is_null(reg)) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; type = STACK_ZERO; + } /* Mark slots affected by this stack write. 
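The slot/spi arithmetic used throughout the stack-tracking code above maps a negative frame-pointer offset onto an 8-byte spill-slot index. A tiny sketch using the same formula:

#include <assert.h>

#define BPF_REG_SIZE 8

/* Map a frame-pointer-relative offset (always negative, fp points just
 * above the stack area) to a spill-slot index: fp-1..fp-8 is slot 0,
 * fp-9..fp-16 is slot 1, and so on.
 */
static int stack_off_to_spi(int off)
{
	int slot = -off - 1;

	return slot / BPF_REG_SIZE;
}

int main(void)
{
	assert(stack_off_to_spi(-8) == 0);	/* *(u64 *)(fp - 8)      */
	assert(stack_off_to_spi(-16) == 1);	/* *(u64 *)(fp - 16)     */
	assert(stack_off_to_spi(-1) == 0);	/* lowest byte of slot 0 */
	assert(stack_off_to_spi(-24) == 2);
	return 0;
}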
*/ for (i = 0; i < size; i++) @@ -1325,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + struct bpf_reg_state *reg; u8 *stype; if (reg_state->allocated_stack <= slot) { @@ -1333,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env, return -EACCES; } stype = reg_state->stack[spi].slot_type; + reg = ®_state->stack[spi].spilled_ptr; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose(env, "invalid size of register spill\n"); - return -EACCES; + if (reg->type != SCALAR_VALUE) { + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "invalid size of register fill\n"); + return -EACCES; + } + if (value_regno >= 0) { + mark_reg_unknown(env, state->regs, value_regno); + state->regs[value_regno].live |= REG_LIVE_WRITTEN; + } + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); + return 0; } for (i = 1; i < BPF_REG_SIZE; i++) { if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { @@ -1348,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + state->regs[value_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); - return 0; + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { int zeros = 0; @@ -1372,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env, off, i, size); return -EACCES; } - mark_reg_read(env, ®_state->stack[spi].spilled_ptr, - reg_state->stack[spi].spilled_ptr.parent); + mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); if (value_regno >= 0) { if (zeros == size) { /* any size read into register is zero extended, * so the whole register == const_zero */ __mark_reg_const_zero(&state->regs[value_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. 
+ */ + state->regs[value_regno].precise = true; } else { /* have read misc data from the stack */ mark_reg_unknown(env, state->regs, value_regno); } state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - return 0; } + return 0; } static int check_stack_access(struct bpf_verifier_env *env, @@ -1402,7 +2109,7 @@ static int check_stack_access(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } @@ -1415,6 +2122,28 @@ static int check_stack_access(struct bpf_verifier_env *env, return 0; } +static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, + int off, int size, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; + u32 cap = bpf_map_flags_to_cap(map); + + if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { + verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { + verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1444,7 +2173,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, state); /* The minimum value is only important with signed @@ -1483,6 +2212,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) verbose(env, "R%d max value is outside of the array range\n", regno); + + if (map_value_has_spin_lock(reg->map_ptr)) { + u32 lock = reg->map_ptr->spin_lock_off; + + /* if any part of struct bpf_spin_lock can be touched by + * load/store reject this program. + * To check that [x1, x2) overlaps with [y1, y2) + * it is sufficient to check x1 < y2 && y1 < x2. 
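The overlap rule quoted above (two half-open ranges [x1, x2) and [y1, y2) intersect iff x1 < y2 && y1 < x2) is easy to sanity-check in isolation; the offsets and lock size below are illustrative, not the real struct layout:

#include <assert.h>
#include <stdbool.h>

/* Two half-open ranges [x1, x2) and [y1, y2) overlap exactly when
 * x1 < y2 && y1 < x2, which is the test used to reject loads and stores
 * that touch any byte of the embedded bpf_spin_lock.
 */
static bool ranges_overlap(long x1, long x2, long y1, long y2)
{
	return x1 < y2 && y1 < x2;
}

int main(void)
{
	long lock_off = 16, lock_sz = 4;	/* illustrative values only */

	/* 8-byte access at offset 8 ends right where the lock begins. */
	assert(!ranges_overlap(8, 8 + 8, lock_off, lock_off + lock_sz));
	/* 8-byte access at offset 12 covers bytes 12..19 and hits it.  */
	assert(ranges_overlap(12, 12 + 8, lock_off, lock_off + lock_sz));
	/* An access entirely past the lock is fine again.              */
	assert(!ranges_overlap(20, 20 + 4, lock_off, lock_off + lock_sz));
	return 0;
}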
+ */ + if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && + lock < reg->umax_value + off + size) { + verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -1516,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, env->seen_direct_write = true; return true; + + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + if (t == BPF_WRITE) + env->seen_direct_write = true; + + return true; + default: return false; } @@ -1617,12 +2368,14 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - struct bpf_insn_access_aux info; + struct bpf_insn_access_aux info = {}; + bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1630,13 +2383,34 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } - if (!bpf_sock_is_valid_access(off, size, t, &info)) { - verbose(env, "invalid bpf_sock access off=%d size=%d\n", - off, size); - return -EACCES; + switch (reg->type) { + case PTR_TO_SOCK_COMMON: + valid = bpf_sock_common_is_valid_access(off, size, t, &info); + break; + case PTR_TO_SOCKET: + valid = bpf_sock_is_valid_access(off, size, t, &info); + break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; + case PTR_TO_XDP_SOCK: + valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); + break; + default: + valid = false; } - return 0; + + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; + return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; } static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1662,8 +2436,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX || - reg->type == PTR_TO_SOCKET; + return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1774,6 +2554,15 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCKET: pointer_desc = "sock "; break; + case PTR_TO_SOCK_COMMON: + pointer_desc = "sock_common "; + break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; + case PTR_TO_XDP_SOCK: + pointer_desc = "xdp_sock "; + break; default: break; } @@ -1840,8 +2629,9 @@ continue_func: } frame++; if (frame >= MAX_CALL_FRAMES) { - WARN_ONCE(1, "verifier bug. 
Call stack is too deep\n"); - return -EFAULT; + verbose(env, "the call stack of %d frames is too deep !\n", + frame); + return -E2BIG; } goto process_func; } @@ -1897,6 +2687,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env, return 0; } +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + if (off < 0) { + verbose(env, + "R%d invalid tracepoint buffer access: off=%d, size=%d", + regno, off, size); + return -EACCES; + } + if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, + "R%d invalid variable buffer offset: off=%d, var_off=%s", + regno, off, tn_buf); + return -EACCES; + } + if (off + size > env->prog->aux->max_tp_access) + env->prog->aux->max_tp_access = off + size; + + return 0; +} + + /* truncate register to smaller size (in bytes) * must be called with size < BPF_REG_SIZE */ @@ -1953,7 +2769,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } - + err = check_map_access_type(env, regno, off, size, t); + if (err) + return err; err = check_map_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); @@ -1977,11 +2795,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. */ - if (reg_type == SCALAR_VALUE) + if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); - else + } else { mark_reg_known_zero(env, regs, value_regno); + if (reg_type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + /* A load of ctx field could have different + * actual load size with the one encoded in the + * insn. When the dst is PTR, it is for sure not + * a sub-register. 
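check_tp_buffer_access() above boils down to: constant, non-negative offsets only, plus a running high-water mark of bytes touched. A simplified stand-alone model (struct and field names are made up):

#include <assert.h>
#include <stdbool.h>

/* Simplified model of the raw_tp buffer check: offsets must be
 * non-negative constants, and the program-wide high-water mark of bytes
 * touched is recorded so the attach point can bounds-check it later.
 */
struct tp_prog { int max_access; };

static bool tp_access_ok(struct tp_prog *prog, int off, int size,
			 bool off_is_const)
{
	if (off < 0 || !off_is_const)
		return false;
	if (off + size > prog->max_access)
		prog->max_access = off + size;
	return true;
}

int main(void)
{
	struct tp_prog prog = { 0 };

	assert(tp_access_ok(&prog, 0, 8, true));
	assert(tp_access_ok(&prog, 16, 4, true));
	assert(prog.max_access == 20);
	assert(!tp_access_ok(&prog, -4, 4, true));	/* negative offset */
	assert(!tp_access_ok(&prog, 8, 8, false));	/* variable offset */
	assert(prog.max_access == 20);			/* unchanged       */
	return 0;
}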
+ */ + regs[value_regno].subreg_def = DEF_NOT_SUBREG; + } regs[value_regno].type = reg_type; } @@ -2027,14 +2854,19 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_SOCKET) { + } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "cannot write into socket\n"); + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); return -EACCES; } - err = check_sock_access(env, regno, off, size, t); + err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_TP_BUFFER) { + err = check_tp_buffer_access(env, reg, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -2076,7 +2908,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg)) { + is_flow_key_reg(env, insn->dst_reg) || + is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2094,6 +2927,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1, true); } +static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, + int off, int access_size, + bool zero_size_allowed) +{ + struct bpf_reg_state *reg = reg_state(env, regno); + + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size < 0 || (access_size == 0 && !zero_size_allowed)) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", + regno, tn_buf, access_size); + } + return -EACCES; + } + return 0; +} + /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. @@ -2106,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, { struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_func_state *state = func(env, reg); - int off, i, slot, spi; + int err, min_off, max_off, i, j, slot, spi; if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -2120,21 +2976,57 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; + if (tnum_is_const(reg->var_off)) { + min_off = max_off = reg->var_off.value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) + return err; + } else { + /* Variable offset is prohibited for unprivileged mode for + * simplicity since it requires corresponding support in + * Spectre masking for stack ALU. + * See also retrieve_ptr_limit(). 
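__check_stack_boundary() above is a pure range test on frame-pointer-relative offsets. The same condition, lifted into a small test program:

#include <assert.h>
#include <stdbool.h>

#define MAX_BPF_STACK 512

/* Mirror of the boundary test above: 'off' is relative to the frame
 * pointer (valid offsets are negative), and the whole access
 * [off, off + size) must stay inside [-MAX_BPF_STACK, 0).
 */
static bool stack_access_ok(int off, int size, bool zero_size_allowed)
{
	if (off >= 0 || off < -MAX_BPF_STACK || off + size > 0 ||
	    size < 0 || (size == 0 && !zero_size_allowed))
		return false;
	return true;
}

int main(void)
{
	assert(stack_access_ok(-8, 8, false));		/* fp-8 .. fp-1    */
	assert(stack_access_ok(-MAX_BPF_STACK, 16, false));
	assert(!stack_access_ok(-8, 16, false));	/* runs past fp    */
	assert(!stack_access_ok(4, 4, false));		/* above fp        */
	assert(!stack_access_ok(-600, 8, false));	/* below the frame */
	assert(stack_access_ok(-8, 0, true));		/* zero size OK'd  */
	return 0;
}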
+ */ + if (!env->allow_ptr_leaks) { + char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid variable stack read R%d var_off=%s\n", - regno, tn_buf); - return -EACCES; - } - off = reg->off + reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - return -EACCES; + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", + regno, tn_buf); + return -EACCES; + } + /* Only initialized buffer on stack is allowed to be accessed + * with variable offset. With uninitialized buffer it's hard to + * guarantee that whole memory is marked as initialized on + * helper return since specific bounds are unknown what may + * cause uninitialized stack leaking. + */ + if (meta && meta->raw_mode) + meta = NULL; + + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smax_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "R%d unbounded indirect variable offset stack access\n", + regno); + return -EACCES; + } + min_off = reg->smin_value + reg->off; + max_off = reg->smax_value + reg->off; + err = __check_stack_boundary(env, regno, min_off, access_size, + zero_size_allowed); + if (err) { + verbose(env, "R%d min value is outside of stack bound\n", + regno); + return err; + } + err = __check_stack_boundary(env, regno, max_off, access_size, + zero_size_allowed); + if (err) { + verbose(env, "R%d max value is outside of stack bound\n", + regno); + return err; + } } if (meta && meta->raw_mode) { @@ -2143,10 +3035,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return 0; } - for (i = 0; i < access_size; i++) { + for (i = min_off; i < max_off + access_size; i++) { u8 *stype; - slot = -(off + i) - 1; + slot = -i - 1; spi = slot / BPF_REG_SIZE; if (state->allocated_stack <= slot) goto err; @@ -2158,18 +3050,35 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + __mark_reg_unknown(&state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + goto mark; + } + err: - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + min_off, i - min_off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", + tn_buf, i - min_off, access_size); + } return -EACCES; mark: /* reading any byte out of 8-byte 'spill_slot' will cause * the whole slot to be marked as 'read' */ mark_reg_read(env, &state->stack[spi].spilled_ptr, - state->stack[spi].spilled_ptr.parent); + state->stack[spi].spilled_ptr.parent, + REG_LIVE_READ64); } - return update_stack_depth(env, state, off); + return update_stack_depth(env, state, min_off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -2184,6 +3093,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_VALUE: + if (check_map_access_type(env, 
regno, reg->off, access_size, + meta && meta->raw_mode ? BPF_WRITE : + BPF_READ)) + return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); default: /* scalar_value|ptr_to_stack or invalid ptr */ @@ -2192,6 +3105,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* Implementation details: + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * Two bpf_map_lookups (even with the same key) will have different reg->id. + * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after + * value_or_null->value transition, since the verifier only cares about + * the range of access to valid map value pointer and doesn't care about actual + * address of the map element. + * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps + * reg->id > 0 after value_or_null->value transition. By doing so + * two bpf_map_lookups will be considered two different pointers that + * point to different bpf_spin_locks. + * The verifier allows taking only one bpf_spin_lock at a time to avoid + * dead-locks. + * Since only one bpf_spin_lock is allowed the checks are simpler than + * reg_is_refcounted() logic. The verifier needs to remember only + * one spin_lock instead of array of acquired_refs. + * cur_state->active_spin_lock remembers which map value element got locked + * and clears it after bpf_spin_unlock. + */ +static int process_spin_lock(struct bpf_verifier_env *env, int regno, + bool is_lock) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_verifier_state *cur = env->cur_state; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "R%d is not a pointer to map_value\n", regno); + return -EINVAL; + } + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. 
bpf_spin_lock has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, + "map '%s' has to have BTF in order to use bpf_spin_lock\n", + map->name); + return -EINVAL; + } + if (!map_value_has_spin_lock(map)) { + if (map->spin_lock_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_spin_lock'\n", + map->name); + else if (map->spin_lock_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_spin_lock'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + map->name); + return -EINVAL; + } + if (map->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", + val + reg->off); + return -EINVAL; + } + if (is_lock) { + if (cur->active_spin_lock) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + cur->active_spin_lock = reg->id; + } else { + if (!cur->active_spin_lock) { + verbose(env, "bpf_spin_unlock without taking a lock\n"); + return -EINVAL; + } + if (cur->active_spin_lock != reg->id) { + verbose(env, "bpf_spin_unlock of different lock\n"); + return -EINVAL; + } + cur->active_spin_lock = 0; + } + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -2205,6 +3203,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type) type == ARG_CONST_SIZE_OR_ZERO; } +static bool arg_type_is_int_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_INT || + type == ARG_PTR_TO_LONG; +} + +static int int_ptr_type_to_size(enum bpf_arg_type type) +{ + if (type == ARG_PTR_TO_INT) + return sizeof(u32); + else if (type == ARG_PTR_TO_LONG) + return sizeof(u64); + + return -EINVAL; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -2237,10 +3251,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { expected_type = PTR_TO_STACK; - if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE && - type != expected_type) + if (register_is_null(reg) && + arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) + /* final test in check_stack_boundary() */; + else if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) goto err_type; } else if (arg_type == ARG_CONST_SIZE || arg_type == ARG_CONST_SIZE_OR_ZERO) { @@ -2258,16 +3277,35 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { + expected_type = PTR_TO_SOCK_COMMON; + /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ + if (!type_is_sk_pointer(type)) + goto err_type; + if (reg->ref_obj_id) { + if (meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + } } else if (arg_type == ARG_PTR_TO_SOCKET) { expected_type = PTR_TO_SOCKET; if (type != expected_type) goto err_type; - if (meta->ptr_id || !reg->id) { - verbose(env, "verifier internal error: mismatched references meta=%d, reg=%d\n", - meta->ptr_id, reg->id); + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id 
== BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); return -EFAULT; } - meta->ptr_id = reg->id; } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2282,6 +3320,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, type != expected_type) goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; + } else if (arg_type_is_int_ptr(arg_type)) { + expected_type = PTR_TO_STACK; + if (!type_is_pkt_pointer(type) && + type != PTR_TO_MAP_VALUE && + type != expected_type) + goto err_type; } else { verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -2308,6 +3352,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta->map_ptr->key_size, false, NULL); } else if (arg_type == ARG_PTR_TO_MAP_VALUE || + (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && + !register_is_null(reg)) || arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -2363,6 +3409,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_helper_mem_access(env, regno - 1, reg->umax_value, zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); + } else if (arg_type_is_int_ptr(arg_type)) { + int size = int_ptr_type_to_size(arg_type); + + err = check_helper_mem_access(env, regno, size, false, meta); + if (err) + return err; + err = check_ptr_alignment(env, reg, 0, size, true); } return err; @@ -2404,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_get_local_storage) goto error; break; - /* devmap returns a pointer to a live net_device ifindex that we cannot - * allow to be modified from bpf side. So do not allow lookup elements - * for now. - */ case BPF_MAP_TYPE_DEVMAP: - if (func_id != BPF_FUNC_redirect_map) + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; /* Restrict bpf side of cpumap and xskmap, open when use-cases * appear. 
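For context, the kind of program the bpf_spin_lock plumbing above is meant to accept looks roughly like the sketch below. It assumes libbpf's BTF-defined map macros and section-name conventions (the lock only works in a BTF-described map value), so treat the includes and SEC() names as illustrative rather than canonical:

/* Illustrative only: assumes libbpf's BTF-defined map macros and helper
 * declarations (<bpf/bpf_helpers.h>), built with clang -target bpf.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct counter {
	struct bpf_spin_lock lock;	/* may not be read or written directly */
	__u64 packets;
	__u64 bytes;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct counter);
} stats SEC(".maps");

SEC("classifier")
int count_packets(struct __sk_buff *skb)
{
	__u32 key = 0;
	struct counter *val;

	val = bpf_map_lookup_elem(&stats, &key);
	if (!val)
		return 0;

	/* Only one lock may be held at a time and the unlock must name the
	 * same map value; both rules are enforced via active_spin_lock.
	 */
	bpf_spin_lock(&val->lock);
	val->packets++;
	val->bytes += skb->len;
	bpf_spin_unlock(&val->lock);
	return 0;
}

char _license[] SEC("license") = "GPL";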
*/ case BPF_MAP_TYPE_CPUMAP: - case BPF_MAP_TYPE_XSKMAP: if (func_id != BPF_FUNC_redirect_map) goto error; break; + case BPF_MAP_TYPE_XSKMAP: + if (func_id != BPF_FUNC_redirect_map && + func_id != BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) @@ -2450,6 +3506,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_map_push_elem) goto error; break; + case BPF_MAP_TYPE_SK_STORAGE: + if (func_id != BPF_FUNC_sk_storage_get && + func_id != BPF_FUNC_sk_storage_delete) + goto error; + break; default: break; } @@ -2513,6 +3574,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, map->map_type != BPF_MAP_TYPE_STACK) goto error; break; + case BPF_FUNC_sk_storage_get: + case BPF_FUNC_sk_storage_delete: + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + goto error; + break; default: break; } @@ -2573,32 +3639,38 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn) return true; } -static bool check_refcount_ok(const struct bpf_func_proto *fn) +static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) { int count = 0; - if (arg_type_is_refcounted(fn->arg1_type)) + if (arg_type_may_be_refcounted(fn->arg1_type)) count++; - if (arg_type_is_refcounted(fn->arg2_type)) + if (arg_type_may_be_refcounted(fn->arg2_type)) count++; - if (arg_type_is_refcounted(fn->arg3_type)) + if (arg_type_may_be_refcounted(fn->arg3_type)) count++; - if (arg_type_is_refcounted(fn->arg4_type)) + if (arg_type_may_be_refcounted(fn->arg4_type)) count++; - if (arg_type_is_refcounted(fn->arg5_type)) + if (arg_type_may_be_refcounted(fn->arg5_type)) count++; + /* A reference acquiring function cannot acquire + * another refcounted ptr. + */ + if (is_acquire_function(func_id) && count) + return false; + /* We only support one arg being unreferenced at the moment, * which is sufficient for the helper functions we have right now. */ return count <= 1; } -static int check_func_proto(const struct bpf_func_proto *fn) +static int check_func_proto(const struct bpf_func_proto *fn, int func_id) { return check_raw_mode_ok(fn) && check_arg_pair_ok(fn) && - check_refcount_ok(fn) ? 0 : -EINVAL; + check_refcount_ok(fn, func_id) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2632,19 +3704,20 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } static void release_reg_references(struct bpf_verifier_env *env, - struct bpf_func_state *state, int id) + struct bpf_func_state *state, + int ref_obj_id) { struct bpf_reg_state *regs = state->regs, *reg; int i; for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].id == id) + if (regs[i].ref_obj_id == ref_obj_id) mark_reg_unknown(env, regs, i); bpf_for_each_spilled_reg(i, state, reg) { if (!reg) continue; - if (reg_is_refcounted(reg) && reg->id == id) + if (reg->ref_obj_id == ref_obj_id) __mark_reg_unknown(reg); } } @@ -2653,15 +3726,20 @@ static void release_reg_references(struct bpf_verifier_env *env, * resources. Identify all copies of the same pointer and clear the reference. 
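release_reg_references() above invalidates every copy of a released pointer by matching ref_obj_id. A toy model of that invalidation, with a much smaller register file and made-up type names:

#include <assert.h>

#define MAX_REG 11

enum reg_type { NOT_INIT, SCALAR, PTR_TO_SOCKET };

struct reg { enum reg_type type; int ref_obj_id; };

/* When a reference is released, every register that is a copy of that
 * same acquired pointer (same ref_obj_id) must be invalidated, so a
 * later dereference through any copy is rejected as use-after-free.
 */
static void release_all_copies(struct reg *regs, int ref_obj_id)
{
	int i;

	for (i = 0; i < MAX_REG; i++) {
		if (regs[i].ref_obj_id == ref_obj_id) {
			regs[i].type = SCALAR;	/* "unknown value" */
			regs[i].ref_obj_id = 0;
		}
	}
}

int main(void)
{
	struct reg regs[MAX_REG] = { 0 };

	/* r0 = acquired socket; r6 and r7 hold copies of it. */
	regs[0] = (struct reg){ PTR_TO_SOCKET, 1 };
	regs[6] = (struct reg){ PTR_TO_SOCKET, 1 };
	regs[7] = (struct reg){ PTR_TO_SOCKET, 1 };
	regs[8] = (struct reg){ PTR_TO_SOCKET, 2 };	/* different ref */

	release_all_copies(regs, 1);
	assert(regs[0].type == SCALAR && regs[6].type == SCALAR);
	assert(regs[7].type == SCALAR);
	assert(regs[8].type == PTR_TO_SOCKET);		/* untouched */
	return 0;
}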
*/ static int release_reference(struct bpf_verifier_env *env, - struct bpf_call_arg_meta *meta) + int ref_obj_id) { struct bpf_verifier_state *vstate = env->cur_state; + int err; int i; + err = release_reference_state(cur_func(env), ref_obj_id); + if (err) + return err; + for (i = 0; i <= vstate->curframe; i++) - release_reg_references(env, vstate->frame[i], meta->ptr_id); + release_reg_references(env, vstate->frame[i], ref_obj_id); - return release_reference_state(env, meta->ptr_id); + return 0; } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -2730,7 +3808,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* and go analyze first insn of the callee */ *insn_idx = target_insn; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); print_verifier_state(env, caller); verbose(env, "callee:\n"); @@ -2770,7 +3848,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; *insn_idx = callee->callsite + 1; - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); print_verifier_state(env, callee); verbose(env, "to caller at %d:\n", *insn_idx); @@ -2804,6 +3882,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, int func_id, int insn_idx) { struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + struct bpf_map *map = meta->map_ptr; if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && @@ -2814,11 +3893,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, func_id != BPF_FUNC_map_peek_elem) return 0; - if (meta->map_ptr == NULL) { + if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); return -EINVAL; } + /* In case of read-only, some additional restrictions + * need to be applied in order to prevent altering the + * state of the map from program side. + */ + if ((map->map_flags & BPF_F_RDONLY_PROG) && + (func_id == BPF_FUNC_map_delete_elem || + func_id == BPF_FUNC_map_update_elem || + func_id == BPF_FUNC_map_push_elem || + func_id == BPF_FUNC_map_pop_elem)) { + verbose(env, "write into map forbidden\n"); + return -EACCES; + } + if (!BPF_MAP_PTR(aux->map_state)) bpf_map_ptr_store(aux, meta->map_ptr, meta->map_ptr->unpriv_array); @@ -2880,13 +3972,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - err = check_func_proto(fn); + err = check_func_proto(fn, func_id); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } + meta.func_id = func_id; /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -2925,9 +4018,12 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } } else if (is_release_function(func_id)) { - err = release_reference(env, &meta); - if (err) + err = release_reference(env, meta.ref_obj_id); + if (err) { + verbose(env, "func %s#%d reference has not been acquired before\n", + func_id_name(func_id), func_id); return err; + } } regs = cur_regs(env); @@ -2947,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } + /* helper call returns 64-bit value. 
*/ + regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; + /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ @@ -2969,23 +4068,44 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].map_ptr = meta.map_ptr; if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; + if (map_value_has_spin_lock(meta.map_ptr)) + regs[BPF_REG_0].id = ++env->id_gen; } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = id; + regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } + if (is_ptr_cast_function(func_id)) { + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; + } else if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = id; + /* For release_reference() */ + regs[BPF_REG_0].ref_obj_id = id; + } + do_refine_retval_range(regs, fn->ret_type, func_id, &meta); err = check_map_func_compatibility(env, meta.map_ptr, func_id); @@ -3084,6 +4204,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, switch (ptr_reg->type) { case PTR_TO_STACK: + /* Indirect variable offset stack access is prohibited in + * unprivileged mode so it's not handled here. + */ off = ptr_reg->off + ptr_reg->var_off.value; if (mask_to_left) *ptr_limit = MAX_BPF_STACK + off; @@ -3184,7 +4307,7 @@ do_sim: *dst_reg = *ptr_reg; } ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); - if (!ptr_is_dst_reg) + if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? 
-EFAULT : 0; } @@ -3239,6 +4362,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -3716,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); + int err; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -3742,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * This is legal, but we have to reverse our * src/dest handling in computing the range */ + err = mark_chain_precision(env, insn->dst_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); } @@ -3850,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) */ *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = DEF_NOT_SUBREG; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { @@ -3860,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } else if (src_reg->type == SCALAR_VALUE) { *dst_reg = *src_reg; dst_reg->live |= REG_LIVE_WRITTEN; + dst_reg->subreg_def = env->insn_idx + 1; } else { mark_reg_unknown(env, regs, insn->dst_reg); @@ -3936,15 +5073,35 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } +static void __find_good_pkt_pointers(struct bpf_func_state *state, + struct bpf_reg_state *dst_reg, + enum bpf_reg_type type, u16 new_range) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + reg = &state->regs[i]; + if (reg->type == type && reg->id == dst_reg->id) + /* keep the maximum range already checked */ + reg->range = max(reg->range, new_range); + } + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } +} + static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { - struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i, j; + int i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -4009,20 +5166,9 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * the range won't allow anything. * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. 
*/ - for (i = 0; i < MAX_BPF_REG; i++) - if (regs[i].type == type && regs[i].id == dst_reg->id) - /* keep the maximum range already checked */ - regs[i].range = max(regs[i].range, new_range); - - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); - } - } + for (i = 0; i <= vstate->curframe; i++) + __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, + new_range); } /* compute branch direction of the expression "if (reg opcode val) goto target;" @@ -4031,11 +5177,50 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, * 0 - branch will not be taken and fall-through to next insn * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] */ -static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, + bool is_jmp32) { + struct bpf_reg_state reg_lo; + s64 sval; + if (__is_pointer_value(false, reg)) return -1; + if (is_jmp32) { + reg_lo = *reg; + reg = ®_lo; + /* For JMP32, only low 32 bits are compared, coerce_reg_to_size + * could truncate high bits and update umin/umax according to + * information of low bits. + */ + coerce_reg_to_size(reg, 4); + /* smin/smax need special handling. For example, after coerce, + * if smin_value is 0x00000000ffffffffLL, the value is -1 when + * used as operand to JMP32. It is a negative number from s32's + * point of view, while it is a positive number when seen as + * s64. The smin/smax are kept as s64, therefore, when used with + * JMP32, they need to be transformed into s32, then sign + * extended back to s64. + * + * Also, smin/smax were copied from umin/umax. If umin/umax has + * different sign bit, then min/max relationship doesn't + * maintain after casting into s32, for this case, set smin/smax + * to safest range. 
+ */ + if ((reg->umax_value ^ reg->umin_value) & + (1ULL << 31)) { + reg->smin_value = S32_MIN; + reg->smax_value = S32_MAX; + } + reg->smin_value = (s64)(s32)reg->smin_value; + reg->smax_value = (s64)(s32)reg->smax_value; + + val = (u32)val; + sval = (s64)(s32)val; + } else { + sval = (s64)val; + } + switch (opcode) { case BPF_JEQ: if (tnum_is_const(reg->var_off)) @@ -4058,9 +5243,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSGT: - if (reg->smin_value > (s64)val) + if (reg->smin_value > sval) return 1; - else if (reg->smax_value < (s64)val) + else if (reg->smax_value < sval) return 0; break; case BPF_JLT: @@ -4070,9 +5255,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSLT: - if (reg->smax_value < (s64)val) + if (reg->smax_value < sval) return 1; - else if (reg->smin_value >= (s64)val) + else if (reg->smin_value >= sval) return 0; break; case BPF_JGE: @@ -4082,9 +5267,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSGE: - if (reg->smin_value >= (s64)val) + if (reg->smin_value >= sval) return 1; - else if (reg->smax_value < (s64)val) + else if (reg->smax_value < sval) return 0; break; case BPF_JLE: @@ -4094,9 +5279,9 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return 0; break; case BPF_JSLE: - if (reg->smax_value <= (s64)val) + if (reg->smax_value <= sval) return 1; - else if (reg->smin_value > (s64)val) + else if (reg->smin_value > sval) return 0; break; } @@ -4104,6 +5289,29 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) return -1; } +/* Generate min value of the high 32-bit from TNUM info. */ +static u64 gen_hi_min(struct tnum var) +{ + return var.value & ~0xffffffffULL; +} + +/* Generate max value of the high 32-bit from TNUM info. */ +static u64 gen_hi_max(struct tnum var) +{ + return (var.value | var.mask) & ~0xffffffffULL; +} + +/* Return true if VAL is compared with a s64 sign extended from s32, and they + * are with the same signedness. + */ +static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +{ + return ((s32)sval >= 0 && + reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || + ((s32)sval < 0 && + reg->smax_value <= 0 && reg->smin_value >= S32_MIN); +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -4111,8 +5319,10 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) */ static void reg_set_min_max(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { + s64 sval; + /* If the dst_reg is a pointer, we can't learn anything about its * variable offset from the compare (unless src_reg were a pointer into * the same object, but we don't bother with that. @@ -4122,19 +5332,31 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, if (__is_pointer_value(false, false_reg)) return; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. 
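
The s32-versus-s64 subtlety described in the comment above can be reproduced with a few lines of ordinary C. A user-space sketch (not part of the patch) of what coerce_reg_to_size() plus the sign fix-up effectively compute for a JMP32 operand:

#include <stdint.h>
#include <stdio.h>

/* keep only the low 32 bits, then sign-extend back to 64 bits */
static int64_t lo32_as_s64(uint64_t v)
{
        return (int64_t)(int32_t)(uint32_t)v;
}

int main(void)
{
        uint64_t v = 0x00000000ffffffffULL;     /* the example from the comment */

        printf("as s64: %lld\n", (long long)v);                 /* 4294967295 */
        printf("as JMP32 operand: %lld\n",
               (long long)lo32_as_s64(v));                      /* -1 */
        return 0;
}

The gen_hi_min()/gen_hi_max() helpers added above go in the other direction: they recover bounds for the high 32 bits from the tnum, so that the full 64-bit umin/umax can still be tightened after a comparison that only looked at the low 32 bits.
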
- */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + /* For BPF_JEQ, if this is false we know nothing Jon Snow, but + * if it is true we know the value for sure. Likewise for + * BPF_JNE. */ - __mark_reg_known(false_reg, val); + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; + } case BPF_JSET: false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val)); @@ -4142,38 +5364,61 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val)); break; - case BPF_JGT: - false_reg->umax_value = min(false_reg->umax_value, val); - true_reg->umin_value = max(true_reg->umin_value, val + 1); - break; - case BPF_JSGT: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - break; - case BPF_JLT: - false_reg->umin_value = max(false_reg->umin_value, val); - true_reg->umax_value = min(true_reg->umax_value, val - 1); - break; - case BPF_JSLT: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - break; case BPF_JGE: - false_reg->umax_value = min(false_reg->umax_value, val - 1); - true_reg->umin_value = max(true_reg->umin_value, val); + case BPF_JGT: + { + u64 false_umax = opcode == BPF_JGT ? val : val - 1; + u64 true_umin = opcode == BPF_JGT ? val + 1 : val; + + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSGE: - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); + case BPF_JSGT: + { + s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; + + /* If the full s64 was not sign-extended from s32 then don't + * deduct further info. + */ + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } case BPF_JLE: - false_reg->umin_value = max(false_reg->umin_value, val + 1); - true_reg->umax_value = min(true_reg->umax_value, val); + case BPF_JLT: + { + u64 false_umin = opcode == BPF_JLT ? val : val + 1; + u64 true_umax = opcode == BPF_JLT ? val - 1 : val; + + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSLE: - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); + case BPF_JSLT: + { + s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSLT ? 
sval - 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } default: break; } @@ -4196,24 +5441,34 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg, */ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, struct bpf_reg_state *false_reg, u64 val, - u8 opcode) + u8 opcode, bool is_jmp32) { + s64 sval; + if (__is_pointer_value(false, false_reg)) return; + val = is_jmp32 ? (u32)val : val; + sval = is_jmp32 ? (s64)(s32)val : (s64)val; + switch (opcode) { case BPF_JEQ: - /* If this is false then we know nothing Jon Snow, but if it is - * true then we know for sure. - */ - __mark_reg_known(true_reg, val); - break; case BPF_JNE: - /* If this is true we know nothing Jon Snow, but if it is false - * we know the value for sure; - */ - __mark_reg_known(false_reg, val); + { + struct bpf_reg_state *reg = + opcode == BPF_JEQ ? true_reg : false_reg; + + if (is_jmp32) { + u64 old_v = reg->var_off.value; + u64 hi_mask = ~0xffffffffULL; + + reg->var_off.value = (old_v & hi_mask) | val; + reg->var_off.mask &= hi_mask; + } else { + __mark_reg_known(reg, val); + } break; + } case BPF_JSET: false_reg->var_off = tnum_and(false_reg->var_off, tnum_const(~val)); @@ -4221,38 +5476,58 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, true_reg->var_off = tnum_or(true_reg->var_off, tnum_const(val)); break; - case BPF_JGT: - true_reg->umax_value = min(true_reg->umax_value, val - 1); - false_reg->umin_value = max(false_reg->umin_value, val); - break; - case BPF_JSGT: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val - 1); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val); - break; - case BPF_JLT: - true_reg->umin_value = max(true_reg->umin_value, val + 1); - false_reg->umax_value = min(false_reg->umax_value, val); - break; - case BPF_JSLT: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val + 1); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val); - break; case BPF_JGE: - true_reg->umax_value = min(true_reg->umax_value, val); - false_reg->umin_value = max(false_reg->umin_value, val + 1); + case BPF_JGT: + { + u64 false_umin = opcode == BPF_JGT ? val : val + 1; + u64 true_umax = opcode == BPF_JGT ? val - 1 : val; + + if (is_jmp32) { + false_umin += gen_hi_min(false_reg->var_off); + true_umax += gen_hi_max(true_reg->var_off); + } + false_reg->umin_value = max(false_reg->umin_value, false_umin); + true_reg->umax_value = min(true_reg->umax_value, true_umax); break; + } case BPF_JSGE: - true_reg->smax_value = min_t(s64, true_reg->smax_value, val); - false_reg->smin_value = max_t(s64, false_reg->smin_value, val + 1); + case BPF_JSGT: + { + s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; + s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smin_value = max(false_reg->smin_value, false_smin); + true_reg->smax_value = min(true_reg->smax_value, true_smax); break; + } case BPF_JLE: - true_reg->umin_value = max(true_reg->umin_value, val); - false_reg->umax_value = min(false_reg->umax_value, val - 1); + case BPF_JLT: + { + u64 false_umax = opcode == BPF_JLT ? val : val - 1; + u64 true_umin = opcode == BPF_JLT ? 
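
Stripped of the JMP32 and tnum details, the unsigned half of the rewritten reg_set_min_max() reduces to a small piece of interval arithmetic. A stand-alone sketch for the BPF_JGT/BPF_JGE pair (the JLT/JLE and signed cases are symmetric):

#include <stdbool.h>
#include <stdint.h>

struct ubounds {
        uint64_t umin;
        uint64_t umax;
};

/* refine both branches of "if (dst > val)" (is_jgt) or "if (dst >= val)" */
static void refine_jgt_jge(struct ubounds *true_b, struct ubounds *false_b,
                           uint64_t val, bool is_jgt)
{
        uint64_t false_umax = is_jgt ? val : val - 1;
        uint64_t true_umin  = is_jgt ? val + 1 : val;

        if (false_b->umax > false_umax)
                false_b->umax = false_umax;     /* min(umax, false_umax) */
        if (true_b->umin < true_umin)
                true_b->umin = true_umin;       /* max(umin, true_umin) */
}

Starting from the full range [0, U64_MAX], "if (r1 > 5)" leaves the true branch with [6, U64_MAX] and the false branch with [0, 5], which is exactly what the min()/max() calls in the patch compute.
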
val + 1 : val; + + if (is_jmp32) { + false_umax += gen_hi_max(false_reg->var_off); + true_umin += gen_hi_min(true_reg->var_off); + } + false_reg->umax_value = min(false_reg->umax_value, false_umax); + true_reg->umin_value = max(true_reg->umin_value, true_umin); break; + } case BPF_JSLE: - true_reg->smin_value = max_t(s64, true_reg->smin_value, val); - false_reg->smax_value = min_t(s64, false_reg->smax_value, val - 1); + case BPF_JSLT: + { + s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; + s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; + + if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) + break; + false_reg->smax_value = min(false_reg->smax_value, false_smax); + true_reg->smin_value = max(true_reg->smin_value, true_smin); break; + } default: break; } @@ -4338,22 +5613,54 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (reg->map_ptr->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; reg->map_ptr = reg->map_ptr->inner_map_meta; + } else if (reg->map_ptr->map_type == + BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; } else { reg->type = PTR_TO_MAP_VALUE; } } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; + } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { + reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } - if (is_null || !reg_is_refcounted(reg)) { - /* We don't need id from this point onwards anymore, - * thus we should better reset it, so that state - * pruning has chances to take effect. + if (is_null) { + /* We don't need id and ref_obj_id from this point + * onwards anymore, thus we should better reset it, + * so that state pruning has chances to take effect. + */ + reg->id = 0; + reg->ref_obj_id = 0; + } else if (!reg_may_point_to_spin_lock(reg)) { + /* For not-NULL ptr, reg->ref_obj_id will be reset + * in release_reg_references(). + * + * reg->id is still used by spin_lock ptr. Other + * than spin_lock ptr type, reg->id can be reset. */ reg->id = 0; } } } +static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, + bool is_null) +{ + struct bpf_reg_state *reg; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) + mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); + + bpf_for_each_spilled_reg(i, state, reg) { + if (!reg) + continue; + mark_ptr_or_null_reg(state, reg, id, is_null); + } +} + /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ @@ -4361,24 +5668,20 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { struct bpf_func_state *state = vstate->frame[vstate->curframe]; - struct bpf_reg_state *reg, *regs = state->regs; + struct bpf_reg_state *regs = state->regs; + u32 ref_obj_id = regs[regno].ref_obj_id; u32 id = regs[regno].id; - int i, j; - - if (reg_is_refcounted_or_null(®s[regno]) && is_null) - __release_reference_state(state, id); + int i; - for (i = 0; i < MAX_BPF_REG; i++) - mark_ptr_or_null_reg(state, ®s[i], id, is_null); + if (ref_obj_id && ref_obj_id == id && is_null) + /* regs[regno] is in the " == NULL" branch. + * No one could have freed the reference state before + * doing the NULL check. 
+ */ + WARN_ON_ONCE(release_reference_state(state, id)); - for (j = 0; j <= vstate->curframe; j++) { - state = vstate->frame[j]; - bpf_for_each_spilled_reg(i, state, reg) { - if (!reg) - continue; - mark_ptr_or_null_reg(state, reg, id, is_null); - } - } + for (i = 0; i <= vstate->curframe; i++) + __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); } static bool try_match_pkt_pointers(const struct bpf_insn *insn, @@ -4390,6 +5693,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, if (BPF_SRC(insn->code) != BPF_X) return false; + /* Pointers are always 64-bit. */ + if (BPF_CLASS(insn->code) == BPF_JMP32) + return false; + switch (BPF_OP(insn->code)) { case BPF_JGT: if ((dst_reg->type == PTR_TO_PACKET && @@ -4480,18 +5787,21 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *this_branch = env->cur_state; struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; - struct bpf_reg_state *dst_reg, *other_branch_regs; + struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; u8 opcode = BPF_OP(insn->code); + bool is_jmp32; + int pred = -1; int err; - if (opcode > BPF_JSLE) { - verbose(env, "invalid BPF_JMP opcode %x\n", opcode); + /* Only conditional jumps are expected to reach here. */ + if (opcode == BPF_JA || opcode > BPF_JSLE) { + verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } @@ -4505,9 +5815,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->src_reg); return -EACCES; } + src_reg = ®s[insn->src_reg]; } else { if (insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } } @@ -4518,20 +5829,31 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; dst_reg = ®s[insn->dst_reg]; - - if (BPF_SRC(insn->code) == BPF_K) { - int pred = is_branch_taken(dst_reg, insn->imm, opcode); - - if (pred == 1) { - /* only follow the goto, ignore fall-through */ - *insn_idx += insn->off; - return 0; - } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go - */ - return 0; - } + is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; + + if (BPF_SRC(insn->code) == BPF_K) + pred = is_branch_taken(dst_reg, insn->imm, + opcode, is_jmp32); + else if (src_reg->type == SCALAR_VALUE && + tnum_is_const(src_reg->var_off)) + pred = is_branch_taken(dst_reg, src_reg->var_off.value, + opcode, is_jmp32); + if (pred >= 0) { + err = mark_chain_precision(env, insn->dst_reg); + if (BPF_SRC(insn->code) == BPF_X && !err) + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; + } + if (pred == 1) { + /* only follow the goto, ignore fall-through */ + *insn_idx += insn->off; + return 0; + } else if (pred == 0) { + /* only follow fall-through branch, since + * that's where the program will go + */ + return 0; } other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, @@ -4548,30 +5870,51 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, * comparable. 
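
For reference, the BPF_JMP32 class that check_cond_jmp_op() now accepts differs from BPF_JMP only in its class bits; the operands are the same registers, but only their low 32 bits take part in the comparison, which is also why the packet-pointer matching above bails out for it ("Pointers are always 64-bit"). A raw-encoding sketch using only UAPI definitions from linux/bpf.h:

#include <linux/bpf.h>

/* "if (u32)r1 > 7 goto +2" */
static const struct bpf_insn jmp32_example = {
        .code    = BPF_JMP32 | BPF_JGT | BPF_K,
        .dst_reg = BPF_REG_1,
        .src_reg = 0,
        .off     = 2,   /* skip the next two instructions when taken */
        .imm     = 7,
};
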
*/ if (BPF_SRC(insn->code) == BPF_X) { + struct bpf_reg_state *src_reg = ®s[insn->src_reg]; + struct bpf_reg_state lo_reg0 = *dst_reg; + struct bpf_reg_state lo_reg1 = *src_reg; + struct bpf_reg_state *src_lo, *dst_lo; + + dst_lo = &lo_reg0; + src_lo = &lo_reg1; + coerce_reg_to_size(dst_lo, 4); + coerce_reg_to_size(src_lo, 4); + if (dst_reg->type == SCALAR_VALUE && - regs[insn->src_reg].type == SCALAR_VALUE) { - if (tnum_is_const(regs[insn->src_reg].var_off)) + src_reg->type == SCALAR_VALUE) { + if (tnum_is_const(src_reg->var_off) || + (is_jmp32 && tnum_is_const(src_lo->var_off))) reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, regs[insn->src_reg].var_off.value, - opcode); - else if (tnum_is_const(dst_reg->var_off)) + dst_reg, + is_jmp32 + ? src_lo->var_off.value + : src_reg->var_off.value, + opcode, is_jmp32); + else if (tnum_is_const(dst_reg->var_off) || + (is_jmp32 && tnum_is_const(dst_lo->var_off))) reg_set_min_max_inv(&other_branch_regs[insn->src_reg], - ®s[insn->src_reg], - dst_reg->var_off.value, opcode); - else if (opcode == BPF_JEQ || opcode == BPF_JNE) + src_reg, + is_jmp32 + ? dst_lo->var_off.value + : dst_reg->var_off.value, + opcode, is_jmp32); + else if (!is_jmp32 && + (opcode == BPF_JEQ || opcode == BPF_JNE)) /* Comparing for equality, we can combine knowledge */ reg_combine_min_max(&other_branch_regs[insn->src_reg], &other_branch_regs[insn->dst_reg], - ®s[insn->src_reg], - ®s[insn->dst_reg], opcode); + src_reg, dst_reg, opcode); } } else if (dst_reg->type == SCALAR_VALUE) { reg_set_min_max(&other_branch_regs[insn->dst_reg], - dst_reg, insn->imm, opcode); + dst_reg, insn->imm, opcode, is_jmp32); } - /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */ - if (BPF_SRC(insn->code) == BPF_K && + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). + * NOTE: these optimizations below are related with pointer comparison + * which will never be JMP32. 
+ */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && reg_type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either @@ -4588,23 +5931,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, insn->dst_reg); return -EACCES; } - if (env->log.level) + if (env->log.level & BPF_LOG_LEVEL) print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } -/* return the map pointer stored inside BPF_LD_IMM64 instruction */ -static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) -{ - u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; - - return (struct bpf_map *) (unsigned long) imm64; -} - /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { + struct bpf_insn_aux_data *aux = cur_aux(env); struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map; int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -4628,11 +5965,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ - BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + map = env->used_maps[aux->map_index]; + mark_reg_known_zero(env, regs, insn->dst_reg); + regs[insn->dst_reg].map_ptr = map; + + if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].off = aux->map_off; + if (map_value_has_spin_lock(map)) + regs[insn->dst_reg].id = ++env->id_gen; + } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + } else { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } - regs[insn->dst_reg].type = CONST_PTR_TO_MAP; - regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); return 0; } @@ -4713,6 +6061,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + if (env->cur_state->active_spin_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); + return -EINVAL; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -4737,20 +6090,32 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * Already marked as written above. */ mark_reg_unknown(env, regs, BPF_REG_0); + /* ld_abs load up to 32-bit skb data. 
*/ + regs[BPF_REG_0].subreg_def = env->insn_idx + 1; return 0; } static int check_return_code(struct bpf_verifier_env *env) { + struct tnum enforce_attach_type_range = tnum_unknown; struct bpf_reg_state *reg; struct tnum range = tnum_range(0, 1); switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { + range = tnum_range(0, 3); + enforce_attach_type_range = tnum_range(2, 3); + } case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: break; default: return 0; @@ -4764,18 +6129,23 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } + + if (!tnum_is_unknown(enforce_attach_type_range) && + tnum_in(enforce_attach_type_range, reg->var_off)) + env->prog->enforce_expected_attach_type = 1; return 0; } @@ -4819,19 +6189,37 @@ enum { BRANCH = 2, }; -#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) +static u32 state_htab_size(struct bpf_verifier_env *env) +{ + return env->prog->len; +} + +static struct bpf_verifier_state_list **explored_state( + struct bpf_verifier_env *env, + int idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_func_state *state = cur->frame[cur->curframe]; + + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; +} -static int *insn_stack; /* stack of insns to process */ -static int cur_stack; /* current stack index */ -static int *insn_state; +static void init_explored_state(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].prune_point = true; +} /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction * e - edge */ -static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) +static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + bool loop_ok) { + int *insn_stack = env->cfg.insn_stack; + int *insn_state = env->cfg.insn_state; + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) return 0; @@ -4846,17 +6234,19 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) if (e == BRANCH) /* mark branch target for state pruning */ - env->explored_states[w] = STATE_LIST_MARK; + init_explored_state(env, w); if (insn_state[w] == 0) { /* tree-edge */ insn_state[t] = DISCOVERED | e; insn_state[w] = DISCOVERED; - if (cur_stack >= env->prog->len) + if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; - insn_stack[cur_stack++] = w; + insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + if (loop_ok && env->allow_ptr_leaks) + return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -4878,44 +6268,47 @@ 
static int check_cfg(struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; + int *insn_stack, *insn_state; int ret = 0; int i, t; - insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; - insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_stack) { - kfree(insn_state); + kvfree(insn_state); return -ENOMEM; } insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ - cur_stack = 1; + env->cfg.cur_stack = 1; peek_stack: - if (cur_stack == 0) + if (env->cfg.cur_stack == 0) goto check_state; - t = insn_stack[cur_stack - 1]; + t = insn_stack[env->cfg.cur_stack - 1]; - if (BPF_CLASS(insns[t].code) == BPF_JMP) { + if (BPF_CLASS(insns[t].code) == BPF_JMP || + BPF_CLASS(insns[t].code) == BPF_JMP32) { u8 opcode = BPF_OP(insns[t].code); if (opcode == BPF_EXIT) { goto mark_explored; } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); if (insns[t].src_reg == BPF_PSEUDO_CALL) { - env->explored_states[t] = STATE_LIST_MARK; - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -4928,26 +6321,31 @@ peek_stack: } /* unconditional jump with single edge */ ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env); + FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). 
+ */ + init_explored_state(env, t + insns[t].off + 1); /* tell verifier to check for equivalent states * after every call and jump */ if (t + 1 < insn_cnt) - env->explored_states[t + 1] = STATE_LIST_MARK; + init_explored_state(env, t + 1); } else { /* conditional jump with two edges */ - env->explored_states[t] = STATE_LIST_MARK; - ret = push_insn(t, t + 1, FALLTHROUGH, env); + init_explored_state(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) goto err_free; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -4957,7 +6355,7 @@ peek_stack: /* all other non-branch instructions with single * fall-through edge */ - ret = push_insn(t, t + 1, FALLTHROUGH, env); + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); if (ret == 1) goto peek_stack; else if (ret < 0) @@ -4966,7 +6364,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; - if (cur_stack-- <= 0) { + if (env->cfg.cur_stack-- <= 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; @@ -4984,8 +6382,9 @@ check_state: ret = 0; /* cfg looks good */ err_free: - kfree(insn_state); - kfree(insn_stack); + kvfree(insn_state); + kvfree(insn_stack); + env->cfg.insn_state = env->cfg.insn_stack = NULL; return ret; } @@ -4997,13 +6396,14 @@ static int check_btf_func(struct bpf_verifier_env *env, const union bpf_attr *attr, union bpf_attr __user *uattr) { - u32 i, nfuncs, urec_size, min_size, prev_offset; + u32 i, nfuncs, urec_size, min_size; u32 krec_size = sizeof(struct bpf_func_info); struct bpf_func_info *krecord; const struct btf_type *type; struct bpf_prog *prog; const struct btf *btf; void __user *urecord; + u32 prev_offset = 0; int ret = 0; nfuncs = attr->func_info_cnt; @@ -5386,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state_list *sl; int i; - sl = env->explored_states[insn]; - if (!sl) - return; - - while (sl != STATE_LIST_MARK) { - if (sl->state.curframe != cur->curframe) + sl = *explored_state(env, insn); + while (sl) { + if (sl->state.branches) + goto next; + if (sl->state.insn_idx != insn || + sl->state.curframe != cur->curframe) goto next; for (i = 0; i <= cur->curframe; i++) if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) @@ -5431,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, switch (rold->type) { case SCALAR_VALUE: if (rcur->type == SCALAR_VALUE) { + if (!rold->precise && !rcur->precise) + return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); @@ -5447,8 +6849,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. 
- * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + * 'id' is not compared, since it's only used for maps with + * bpf_spin_lock inside map element and in such cases if + * the rest of the prog is valid for one map element then + * it's valid for all map elements regardless of the key + * used in bpf_map_lookup() */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && @@ -5496,6 +6901,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -5651,6 +7061,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; + if (old->active_spin_lock != cur->active_spin_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5663,6 +7076,35 @@ static bool states_equal(struct bpf_verifier_env *env, return true; } +/* Return 0 if no propagation happened. Return negative error code if error + * happened. Otherwise, return the propagated bit. + */ +static int propagate_liveness_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + struct bpf_reg_state *parent_reg) +{ + u8 parent_flag = parent_reg->live & REG_LIVE_READ; + u8 flag = reg->live & REG_LIVE_READ; + int err; + + /* When comes here, read flags of PARENT_REG or REG could be any of + * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need + * of propagation if PARENT_REG has strongest REG_LIVE_READ64. + */ + if (parent_flag == REG_LIVE_READ64 || + /* Or if there is no read flag from REG. */ + !flag || + /* Or if the read flag from REG is the same as PARENT_REG. */ + parent_flag == flag) + return 0; + + err = mark_reg_read(env, reg, parent_reg, flag); + if (err) + return err; + + return flag; +} + /* A write screens off any subsequent reads; but write marks come from the * straight-line code between a state and its parent. When we arrive at an * equivalent state (jump target or such) we didn't arrive by the straight-line @@ -5674,8 +7116,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, struct bpf_verifier_state *vparent) { - int i, frame, err = 0; + struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; + int i, frame, err = 0; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", @@ -5684,52 +7127,156 @@ static int propagate_liveness(struct bpf_verifier_env *env, } /* Propagate read liveness of registers... 
*/ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); - /* We don't need to worry about FP liveness because it's read-only */ - for (i = 0; i < BPF_REG_FP; i++) { - if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) - continue; - if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { - err = mark_reg_read(env, &vstate->frame[vstate->curframe]->regs[i], - &vparent->frame[vstate->curframe]->regs[i]); - if (err) + for (frame = 0; frame <= vstate->curframe; frame++) { + parent = vparent->frame[frame]; + state = vstate->frame[frame]; + parent_reg = parent->regs; + state_reg = state->regs; + /* We don't need to worry about FP liveness, it's read-only */ + for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { + err = propagate_liveness_reg(env, &state_reg[i], + &parent_reg[i]); + if (err < 0) return err; + if (err == REG_LIVE_READ64) + mark_insn_zext(env, &parent_reg[i]); } - } - /* ... and stack slots */ - for (frame = 0; frame <= vstate->curframe; frame++) { - state = vstate->frame[frame]; - parent = vparent->frame[frame]; + /* Propagate stack slots. */ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) - mark_reg_read(env, &state->stack[i].spilled_ptr, - &parent->stack[i].spilled_ptr); + parent_reg = &parent->stack[i].spilled_ptr; + state_reg = &state->stack[i].spilled_ptr; + err = propagate_liveness_reg(env, state_reg, + parent_reg); + if (err < 0) + return err; } } - return err; + return 0; } +/* find precise scalars in the previous equivalent state and + * propagate them into the current state + */ +static int propagate_precision(struct bpf_verifier_env *env, + const struct bpf_verifier_state *old) +{ + struct bpf_reg_state *state_reg; + struct bpf_func_state *state; + int i, err = 0; + + state = old->frame[old->curframe]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating r%d\n", i); + err = mark_chain_precision(env, i); + if (err < 0) + return err; + } + + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE); + err = mark_chain_precision_stack(env, i); + if (err < 0) + return err; + } + return 0; +} + +static bool states_maybe_looping(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + struct bpf_func_state *fold, *fcur; + int i, fr = cur->curframe; + + if (old->curframe != fr) + return false; + + fold = old->frame[fr]; + fcur = cur->frame[fr]; + for (i = 0; i < MAX_BPF_REG; i++) + if (memcmp(&fold->regs[i], &fcur->regs[i], + offsetof(struct bpf_reg_state, parent))) + return false; + return true; +} + + static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; - struct bpf_verifier_state_list *sl; + struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new; int i, j, err, states_cnt = 0; + bool add_new_state = false; - sl = env->explored_states[insn_idx]; - if (!sl) + cur->last_insn_idx = 
env->prev_insn_idx; + if (!env->insn_aux_data[insn_idx].prune_point) /* this 'insn_idx' instruction wasn't marked, so we will not * be doing state search here */ return 0; + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 + * Do not add new state for future pruning if the verifier hasn't seen + * at least 2 jumps and at least 8 instructions. + * This heuristic helps decrease 'total_states' and 'peak_states' metric. + * In tests that amounts to up to 50% reduction in total verifier + * memory consumption and 20% verifier time speedup. + */ + if (env->jmps_processed - env->prev_jmps_processed >= 2 && + env->insn_processed - env->prev_insn_processed >= 8) + add_new_state = true; + + pprev = explored_state(env, insn_idx); + sl = *pprev; + clean_live_states(env, insn_idx, cur); - while (sl != STATE_LIST_MARK) { + while (sl) { + states_cnt++; + if (sl->state.insn_idx != insn_idx) + goto next; + if (sl->state.branches) { + if (states_maybe_looping(&sl->state, cur) && + states_equal(env, &sl->state, cur)) { + verbose_linfo(env, insn_idx, "; "); + verbose(env, "infinite loop detected at insn %d\n", insn_idx); + return -EINVAL; + } + /* if the verifier is processing a loop, avoid adding new state + * too often, since different loop iterations have distinct + * states and may not help future pruning. + * This threshold shouldn't be too low to make sure that + * a loop with large bound will be rejected quickly. + * The most abusive loop will be: + * r1 += 1 + * if r1 < 1000000 goto pc-2 + * 1M insn_processed limit / 100 == 10k peak states. + * This threshold shouldn't be too high either, since states + * at the end of the loop are likely to be useful in pruning. + */ + if (env->jmps_processed - env->prev_jmps_processed < 20 && + env->insn_processed - env->prev_insn_processed < 100) + add_new_state = false; + goto miss; + } if (states_equal(env, &sl->state, cur)) { + sl->hit_cnt++; /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -5741,27 +7288,87 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * this state and will pop a new one. */ err = propagate_liveness(env, &sl->state, cur); + + /* if previous state reached the exit with precision and + * current state is equivalent to it (except precision marks) + * the precision needs to be propagated back in + * the current state. + */ + err = err ? : push_jmp_history(env, cur); + err = err ? : propagate_precision(env, &sl->state); if (err) return err; return 1; } - sl = sl->next; - states_cnt++; +miss: + /* when new state is not going to be added do not increase miss count. + * Otherwise several loop iterations will remove the state + * recorded earlier. The goal of these heuristics is to have + * states from some iterations of the loop (some in the beginning + * and some at the end) to help pruning. + */ + if (add_new_state) + sl->miss_cnt++; + /* heuristic to determine whether this state is beneficial + * to keep checking from state equivalence point of view. + * Higher numbers increase max_states_per_insn and verification time, + * but do not meaningfully decrease insn_processed. + */ + if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { + /* the state is unlikely to be useful.
Remove it to + * speed up verification + */ + *pprev = sl->next; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { + u32 br = sl->state.branches; + + WARN_ONCE(br, + "BUG live_done but branches_to_explore %d\n", + br); + free_verifier_state(&sl->state, false); + kfree(sl); + env->peak_states--; + } else { + /* cannot free this state, since parentage chain may + * walk it later. Add it for free_list instead to + * be freed at the end of verification + */ + sl->next = env->free_list; + env->free_list = sl; + } + sl = *pprev; + continue; + } +next: + pprev = &sl->next; + sl = *pprev; } + if (env->max_states_per_insn < states_cnt) + env->max_states_per_insn = states_cnt; + if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return 0; + return push_jmp_history(env, cur); - /* there were no equivalent states, remember current one. - * technically the current state is not proven to be safe yet, + if (!add_new_state) + return push_jmp_history(env, cur); + + /* There were no equivalent states, remember the current one. + * Technically the current state is not proven to be safe yet, * but it will either reach outer most bpf_exit (which means it's safe) - * or it will be rejected. Since there are no loops, we won't be + * or it will be rejected. When there are no loops the verifier won't be * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) - * again on the way to bpf_exit + * again on the way to bpf_exit. + * When looping the sl->state.branches will be > 0 and this state + * will not be considered for equivalence until branches == 0. */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; + env->total_states++; + env->peak_states++; + env->prev_jmps_processed = env->jmps_processed; + env->prev_insn_processed = env->insn_processed; /* add new state to the head of linked list */ new = &new_sl->state; @@ -5771,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) kfree(new_sl); return err; } - new_sl->next = env->explored_states[insn_idx]; - env->explored_states[insn_idx] = new_sl; + new->insn_idx = insn_idx; + WARN_ONCE(new->branches != 1, + "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + + cur->parent = new; + cur->first_insn_idx = insn_idx; + clear_jmp_history(cur); + new_sl->next = *explored_state(env, insn_idx); + *explored_state(env, insn_idx) = new_sl; /* connect new state to parentage chain. Current frame needs all * registers connected. Only r6 - r9 of the callers are alive (pushed * to the stack implicitly by JITs) so in callers' frames connect just @@ -5780,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * the state of the call instruction (with WRITTEN set), and r0 comes * from callee with its full parentage chain, anyway. */ - for (j = 0; j <= cur->curframe; j++) - for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) - cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark * their parent and current state never has children yet. Only * explored_states can get read marks.) */ - for (i = 0; i < BPF_REG_FP; i++) - cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; + for (j = 0; j <= cur->curframe; j++) { + for (i = j < cur->curframe ? 
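
The "most abusive" loop quoted in the heuristic comment above is small enough to write out in full. A sketch of it as raw instructions (plain struct bpf_insn initializers, so no kernel-internal macros are needed); with a 1,000,000 bound it is exactly the kind of program that exhausts the 1M insn_processed budget, which is what the 10k-peak-states arithmetic in the comment refers to:

#include <linux/bpf.h>

static const struct bpf_insn abusive_loop[] = {
        /* r1 = 0 */
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_1, .imm = 0 },
        /* r1 += 1 */
        { .code = BPF_ALU64 | BPF_ADD | BPF_K, .dst_reg = BPF_REG_1, .imm = 1 },
        /* if r1 < 1000000 goto pc-2  (back to the increment) */
        { .code = BPF_JMP | BPF_JLT | BPF_K, .dst_reg = BPF_REG_1,
          .off = -2, .imm = 1000000 },
        /* r0 = 0; exit */
        { .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
        { .code = BPF_JMP | BPF_EXIT },
};

Such a back-edge only gets past check_cfg() at all because of the loop_ok/allow_ptr_leaks relaxation added earlier in this diff, so bounded loops remain a privileged-only feature here.
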
BPF_REG_6 : 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; + for (i = 0; i < BPF_REG_FP; i++) + cur->frame[j]->regs[i].live = REG_LIVE_NONE; + } /* all stack frames are accessible from callee, clear them all */ for (j = 0; j <= cur->curframe; j++) { @@ -5813,6 +7428,11 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: + case PTR_TO_XDP_SOCK: return false; default: return true; @@ -5842,9 +7462,9 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len, i; - int insn_processed = 0; + int insn_cnt = env->prog->len; bool do_print_state = false; + int prev_insn_idx = -1; env->prev_linfo = NULL; @@ -5853,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env) return -ENOMEM; state->curframe = 0; state->speculative = false; + state->branches = 1; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5869,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env) u8 class; int err; + env->prev_insn_idx = prev_insn_idx; if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", env->insn_idx, insn_cnt); @@ -5878,10 +7500,10 @@ static int do_check(struct bpf_verifier_env *env) insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); - if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { + if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, "BPF program is too large. Processed %d insn\n", - insn_processed); + env->insn_processed); return -E2BIG; } @@ -5890,7 +7512,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { if (do_print_state) verbose(env, "\nfrom %d to %d%s: safe\n", env->prev_insn_idx, env->insn_idx, @@ -5908,8 +7530,9 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level > 1 || (env->log.level && do_print_state)) { - if (env->log.level > 1) + if (env->log.level & BPF_LOG_LEVEL2 || + (env->log.level & BPF_LOG_LEVEL && do_print_state)) { + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "%d:", env->insn_idx); else verbose(env, "\nfrom %d to %d%s:", @@ -5920,7 +7543,7 @@ static int do_check(struct bpf_verifier_env *env) do_print_state = false; } - if (env->log.level) { + if (env->log.level & BPF_LOG_LEVEL) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, .private_data = env, @@ -5940,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env) regs = cur_regs(env); env->insn_aux_data[env->insn_idx].seen = true; + prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -6055,19 +7679,27 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; - } else if (class == BPF_JMP) { + } else if (class == BPF_JMP || class == BPF_JMP32) { u8 opcode = BPF_OP(insn->code); + env->jmps_processed++; if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || (insn->src_reg != BPF_REG_0 && insn->src_reg != BPF_PSEUDO_CALL) || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_CALL 
uses reserved fields\n"); return -EINVAL; } + if (env->cur_state->active_spin_lock && + (insn->src_reg == BPF_PSEUDO_CALL || + insn->imm != BPF_FUNC_spin_unlock)) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else @@ -6079,7 +7711,8 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -6091,14 +7724,19 @@ static int do_check(struct bpf_verifier_env *env) if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0) { + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } + if (env->cur_state->active_spin_lock) { + verbose(env, "bpf_spin_unlock is missing\n"); + return -EINVAL; + } + if (state->curframe) { /* exit from nested function */ - env->prev_insn_idx = env->insn_idx; err = prepare_func_exit(env, &env->insn_idx); if (err) return err; @@ -6129,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &env->prev_insn_idx, + update_branch_counts(env, env->cur_state); + err = pop_stack(env, &prev_insn_idx, &env->insn_idx); if (err < 0) { if (err != -ENOENT) @@ -6171,16 +7810,6 @@ process_bpf_exit: env->insn_idx++; } - verbose(env, "processed %d insns (limit %d), stack depth ", - insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); - for (i = 0; i < env->subprog_cnt; i++) { - u32 depth = env->subprog_info[i].stack_depth; - - verbose(env, "%d", depth); - if (i + 1 < env->subprog_cnt) - verbose(env, "+"); - } - verbose(env, "\n"); env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; return 0; } @@ -6193,6 +7822,19 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } +static bool is_tracing_prog_type(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return true; + default: + return false; + } +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) @@ -6215,6 +7857,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if ((is_tracing_prog_type(prog->type) || + prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + map_value_has_spin_lock(map)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -6258,8 +7907,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_insn_aux_data *aux; struct bpf_map *map; struct fd f; + u64 addr; if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || @@ -6268,21 +7919,27 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return -EINVAL; } - if (insn->src_reg == 0) + if (insn[0].src_reg == 0) /* valid generic load 64-bit imm */ goto next_insn; - if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + /* In 
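
Taken together with the bpf_spin_lock checks earlier in this diff (no BPF_LD_[ABS|IND] in the locked region, no helper calls other than bpf_spin_unlock, no missing unlock at BPF_EXIT), the pattern a program must follow is quite rigid. A sketch of a conforming program, assuming libbpf's bpf_helpers.h conventions and a program type that is allowed to use the lock helpers (tracing and socket-filter programs are rejected by the check_map_prog_compatibility() hunk above):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct elem {
        struct bpf_spin_lock lock;
        __u32 counter;
};

struct bpf_map_def SEC("maps") counters = {
        .type        = BPF_MAP_TYPE_ARRAY,
        .key_size    = sizeof(__u32),
        .value_size  = sizeof(struct elem),
        .max_entries = 1,
};

SEC("classifier")
int bump(struct __sk_buff *skb)
{
        __u32 key = 0;
        struct elem *e = bpf_map_lookup_elem(&counters, &key);

        if (!e)
                return 0;
        bpf_spin_lock(&e->lock);
        e->counter++;           /* plain loads/stores only inside the region */
        bpf_spin_unlock(&e->lock);
        return 0;
}

char _license[] SEC("license") = "GPL";
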
final convert_pseudo_ld_imm64() step, this is + * converted into regular 64-bit imm load insn. + */ + if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && + insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || + (insn[0].src_reg == BPF_PSEUDO_MAP_FD && + insn[1].imm != 0)) { verbose(env, "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } - f = fdget(insn->imm); + f = fdget(insn[0].imm); map = __bpf_map_get(f); if (IS_ERR(map)) { verbose(env, "fd %d is not pointing to valid bpf_map\n", - insn->imm); + insn[0].imm); return PTR_ERR(map); } @@ -6292,16 +7949,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) return err; } - /* store map pointer inside BPF_LD_IMM64 instruction */ - insn[0].imm = (u32) (unsigned long) map; - insn[1].imm = ((u64) (unsigned long) map) >> 32; + aux = &env->insn_aux_data[i]; + if (insn->src_reg == BPF_PSEUDO_MAP_FD) { + addr = (unsigned long)map; + } else { + u32 off = insn[1].imm; + + if (off >= BPF_MAX_VAR_OFF) { + verbose(env, "direct value offset of %u is not allowed\n", off); + fdput(f); + return -EINVAL; + } + + if (!map->ops->map_direct_value_addr) { + verbose(env, "no direct value access support for this map type\n"); + fdput(f); + return -EINVAL; + } + + err = map->ops->map_direct_value_addr(map, &addr, off); + if (err) { + verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", + map->value_size, off); + fdput(f); + return err; + } + + aux->map_off = off; + addr += off; + } + + insn[0].imm = (u32)addr; + insn[1].imm = addr >> 32; /* check whether we recorded this map already */ - for (j = 0; j < env->used_map_cnt; j++) + for (j = 0; j < env->used_map_cnt; j++) { if (env->used_maps[j] == map) { + aux->map_index = j; fdput(f); goto next_insn; } + } if (env->used_map_cnt >= MAX_USED_MAPS) { fdput(f); @@ -6318,6 +8006,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) fdput(f); return PTR_ERR(map); } + + aux->map_index = env->used_map_cnt; env->used_maps[env->used_map_cnt++] = map; if (bpf_map_is_cgroup_storage(map) && @@ -6381,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying * [0, off) and [off, end) to new locations, so the patched range stays zero */ -static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, - u32 off, u32 cnt) +static int adjust_insn_aux_data(struct bpf_verifier_env *env, + struct bpf_prog *new_prog, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + struct bpf_insn *insn = new_prog->insnsi; + u32 prog_len; int i; + /* aux info at OFF always needs adjustment, no matter fast path + * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the + * original insn at old prog. 
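
On the user-space side, a loader can now emit a ld_imm64 whose src_reg is BPF_PSEUDO_MAP_VALUE instead of BPF_PSEUDO_MAP_FD, with the second half of the instruction carrying an offset into the map value. A sketch of building such an instruction pair (map_fd and value_off are placeholders; only map types that implement map_direct_value_addr() accept this form, as the error strings above show):

#include <linux/bpf.h>

static void emit_ld_map_value(struct bpf_insn insn[2], int map_fd,
                              __u32 value_off)
{
        insn[0] = (struct bpf_insn) {
                .code    = BPF_LD | BPF_DW | BPF_IMM,
                .dst_reg = BPF_REG_2,
                .src_reg = BPF_PSEUDO_MAP_VALUE,
                .imm     = map_fd,
        };
        /* second half of the 16-byte ld_imm64: the offset into the value,
         * range-checked against BPF_MAX_VAR_OFF by the hunk above */
        insn[1] = (struct bpf_insn) {
                .imm     = value_off,
        };
}

The verifier then types the destination register as PTR_TO_MAP_VALUE directly (see the check_ld_imm() changes earlier in this diff) rather than CONST_PTR_TO_MAP.
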
+ */ + old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); + if (cnt == 1) return 0; + prog_len = new_prog->len; new_data = vzalloc(array_size(prog_len, sizeof(struct bpf_insn_aux_data))); if (!new_data) @@ -6396,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); - for (i = off; i < off + cnt - 1; i++) + for (i = off; i < off + cnt - 1; i++) { new_data[i].seen = true; + new_data[i].zext_dst = insn_has_def32(env, insn + i); + } env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -6423,14 +8124,166 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of struct bpf_prog *new_prog; new_prog = bpf_patch_insn_single(env->prog, off, patch, len); - if (!new_prog) + if (IS_ERR(new_prog)) { + if (PTR_ERR(new_prog) == -ERANGE) + verbose(env, + "insn %d cannot be patched due to 16-bit range\n", + env->insn_aux_data[off].orig_idx); return NULL; - if (adjust_insn_aux_data(env, new_prog->len, off, len)) + } + if (adjust_insn_aux_data(env, new_prog, off, len)) return NULL; adjust_subprog_starts(env, off, len); return new_prog; } +static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, + u32 off, u32 cnt) +{ + int i, j; + + /* find first prog starting at or after off (first to remove) */ + for (i = 0; i < env->subprog_cnt; i++) + if (env->subprog_info[i].start >= off) + break; + /* find first prog starting at or after off + cnt (first to stay) */ + for (j = i; j < env->subprog_cnt; j++) + if (env->subprog_info[j].start >= off + cnt) + break; + /* if j doesn't start exactly at off + cnt, we are just removing + * the front of previous prog + */ + if (env->subprog_info[j].start != off + cnt) + j--; + + if (j > i) { + struct bpf_prog_aux *aux = env->prog->aux; + int move; + + /* move fake 'exit' subprog as well */ + move = env->subprog_cnt + 1 - j; + + memmove(env->subprog_info + i, + env->subprog_info + j, + sizeof(*env->subprog_info) * move); + env->subprog_cnt -= j - i; + + /* remove func_info */ + if (aux->func_info) { + move = aux->func_info_cnt - j; + + memmove(aux->func_info + i, + aux->func_info + j, + sizeof(*aux->func_info) * move); + aux->func_info_cnt -= j - i; + /* func_info->insn_off is set after all code rewrites, + * in adjust_btf_func() - no need to adjust + */ + } + } else { + /* convert i from "first prog to remove" to "first to adjust" */ + if (env->subprog_info[i].start == off) + i++; + } + + /* update fake 'exit' subprog as well */ + for (; i <= env->subprog_cnt; i++) + env->subprog_info[i].start -= cnt; + + return 0; +} + +static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, + u32 cnt) +{ + struct bpf_prog *prog = env->prog; + u32 i, l_off, l_cnt, nr_linfo; + struct bpf_line_info *linfo; + + nr_linfo = prog->aux->nr_linfo; + if (!nr_linfo) + return 0; + + linfo = prog->aux->linfo; + + /* find first line info to remove, count lines to be removed */ + for (i = 0; i < nr_linfo; i++) + if (linfo[i].insn_off >= off) + break; + + l_off = i; + l_cnt = 0; + for (; i < nr_linfo; i++) + if (linfo[i].insn_off < off + cnt) + l_cnt++; + else + break; + + /* First live insn doesn't match first live linfo, it needs to "inherit" + * last removed linfo. prog is already modified, so prog->len == off + * means no live instructions after (tail of the program was removed). 
+ */ + if (prog->len != off && l_cnt && + (i == nr_linfo || linfo[i].insn_off != off + cnt)) { + l_cnt--; + linfo[--i].insn_off = off + cnt; + } + + /* remove the line info which refer to the removed instructions */ + if (l_cnt) { + memmove(linfo + l_off, linfo + i, + sizeof(*linfo) * (nr_linfo - i)); + + prog->aux->nr_linfo -= l_cnt; + nr_linfo = prog->aux->nr_linfo; + } + + /* pull all linfo[i].insn_off >= off + cnt in by cnt */ + for (i = l_off; i < nr_linfo; i++) + linfo[i].insn_off -= cnt; + + /* fix up all subprogs (incl. 'exit') which start >= off */ + for (i = 0; i <= env->subprog_cnt; i++) + if (env->subprog_info[i].linfo_idx > l_off) { + /* program may have started in the removed region but + * may not be fully removed + */ + if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) + env->subprog_info[i].linfo_idx -= l_cnt; + else + env->subprog_info[i].linfo_idx = l_off; + } + + return 0; +} + +static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + unsigned int orig_prog_len = env->prog->len; + int err; + + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_remove_insns(env, off, cnt); + + err = bpf_remove_insns(env->prog, off, cnt); + if (err) + return err; + + err = adjust_subprog_starts_after_remove(env, off, cnt); + if (err) + return err; + + err = bpf_adj_linfo_after_remove(env, off, cnt); + if (err) + return err; + + memmove(aux_data + off, aux_data + off + cnt, + sizeof(*aux_data) * (orig_prog_len - off - cnt)); + + return 0; +} + /* The verifier does more data flow analysis than llvm and will not * explore branches that are dead at run time. Malicious programs can * have dead code too. Therefore replace all dead at-run-time code @@ -6457,6 +8310,169 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) } } +static bool insn_is_cond_jump(u8 code) +{ + u8 op; + + if (BPF_CLASS(code) == BPF_JMP32) + return true; + + if (BPF_CLASS(code) != BPF_JMP) + return false; + + op = BPF_OP(code); + return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +} + +static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!insn_is_cond_jump(insn->code)) + continue; + + if (!aux_data[i + 1].seen) + ja.off = insn->off; + else if (!aux_data[i + 1 + insn->off].seen) + ja.off = 0; + else + continue; + + if (bpf_prog_is_dev_bound(env->prog->aux)) + bpf_prog_offload_replace_insn(env, i, &ja); + + memcpy(insn, &ja, sizeof(ja)); + } +} + +static int opt_remove_dead_code(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + int j; + + j = 0; + while (i + j < insn_cnt && !aux_data[i + j].seen) + j++; + if (!j) + continue; + + err = verifier_remove_insns(env, i, j); + if (err) + return err; + insn_cnt = env->prog->len; + } + + return 0; +} + +static int opt_remove_nops(struct bpf_verifier_env *env) +{ + const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, err; + + for (i = 0; i < insn_cnt; i++) { + if (memcmp(&insn[i], &ja, sizeof(ja))) + continue; + + err = verifier_remove_insns(env, i, 1); + if (err) + return 
err; + insn_cnt--; + i--; + } + + return 0; +} + +static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, + const union bpf_attr *attr) +{ + struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; + struct bpf_insn_aux_data *aux = env->insn_aux_data; + int i, patch_len, delta = 0, len = env->prog->len; + struct bpf_insn *insns = env->prog->insnsi; + struct bpf_prog *new_prog; + bool rnd_hi32; + + rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; + zext_patch[1] = BPF_ZEXT_REG(0); + rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); + rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); + rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); + for (i = 0; i < len; i++) { + int adj_idx = i + delta; + struct bpf_insn insn; + + insn = insns[adj_idx]; + if (!aux[adj_idx].zext_dst) { + u8 code, class; + u32 imm_rnd; + + if (!rnd_hi32) + continue; + + code = insn.code; + class = BPF_CLASS(code); + if (insn_no_def(&insn)) + continue; + + /* NOTE: arg "reg" (the fourth one) is only used for + * BPF_STX which has been ruled out in above + * check, it is safe to pass NULL here. + */ + if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { + if (class == BPF_LD && + BPF_MODE(code) == BPF_IMM) + i++; + continue; + } + + /* ctx load could be transformed into wider load. */ + if (class == BPF_LDX && + aux[adj_idx].ptr_type == PTR_TO_CTX) + continue; + + imm_rnd = get_random_int(); + rnd_hi32_patch[0] = insn; + rnd_hi32_patch[1].imm = imm_rnd; + rnd_hi32_patch[3].dst_reg = insn.dst_reg; + patch = rnd_hi32_patch; + patch_len = 4; + goto apply_patch_buffer; + } + + if (!bpf_jit_needs_zext()) + continue; + + zext_patch[0] = insn; + zext_patch[1].dst_reg = insn.dst_reg; + zext_patch[1].src_reg = insn.dst_reg; + patch = zext_patch; + patch_len = 2; +apply_patch_buffer: + new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + insns = new_prog->insnsi; + aux = env->insn_aux_data; + delta += patch_len - 1; + } + + return 0; +} + /* convert load instructions that access fields of a context type into a * sequence of instructions that access fields of the underlying structure: * struct __sk_buff -> struct sk_buff @@ -6549,8 +8565,15 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; + case PTR_TO_XDP_SOCK: + convert_ctx_access = bpf_xdp_sock_convert_ctx_access; + break; default: continue; } @@ -6609,7 +8632,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, shift); insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, - (1 << size * 8) - 1); + (1ULL << size * 8) - 1); } } @@ -6678,7 +8701,12 @@ static int jit_subprogs(struct bpf_verifier_env *env) subprog_end = env->subprog_info[i + 1].start; len = subprog_end - subprog_start; - func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + /* BPF_PROG_RUN doesn't call subprogs directly, + * hence main prog stats include the runtime of subprogs. 
+ * subprogs don't have IDs and not reachable via prog_get_next_id + * func[i]->aux->stats will never be accessed and stays NULL + */ + func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], @@ -6721,9 +8749,8 @@ static int jit_subprogs(struct bpf_verifier_env *env) insn->src_reg != BPF_PSEUDO_CALL) continue; subprog = insn->off; - insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) - func[subprog]->bpf_func - - __bpf_call_base; + insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - + __bpf_call_base; } /* we use the aux data to keep a list of the start addresses @@ -6917,7 +8944,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) u32 off_reg; aux = &env->insn_aux_data[i + delta]; - if (!aux->alu_state) + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) continue; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; @@ -7124,30 +9152,63 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_verifier_state_list *sl, *sln; int i; + sl = env->free_list; + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } + if (!env->explored_states) return; - for (i = 0; i < env->prog->len; i++) { + for (i = 0; i < state_htab_size(env); i++) { sl = env->explored_states[i]; - if (sl) - while (sl != STATE_LIST_MARK) { - sln = sl->next; - free_verifier_state(&sl->state, false); - kfree(sl); - sl = sln; - } + while (sl) { + sln = sl->next; + free_verifier_state(&sl->state, false); + kfree(sl); + sl = sln; + } } - kfree(env->explored_states); + kvfree(env->explored_states); +} + +static void print_verification_stats(struct bpf_verifier_env *env) +{ + int i; + + if (env->log.level & BPF_LOG_STATS) { + verbose(env, "verification time %lld usec\n", + div_u64(env->verification_time, 1000)); + verbose(env, "stack depth "); + for (i = 0; i < env->subprog_cnt; i++) { + u32 depth = env->subprog_info[i].stack_depth; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt) + verbose(env, "+"); + } + verbose(env, "\n"); + } + verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " + "total_states %d peak_states %d mark_read %d\n", + env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, + env->max_states_per_insn, env->total_states, + env->peak_states, env->longest_mark_read_walk); } int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, union bpf_attr __user *uattr) { + u64 start_time = ktime_get_ns(); struct bpf_verifier_env *env; struct bpf_verifier_log *log; - int ret = -EINVAL; + int i, len, ret = -EINVAL; + bool is_priv; /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) @@ -7161,17 +9222,21 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, return -ENOMEM; log = &env->log; + len = (*prog)->len; env->insn_aux_data = - vzalloc(array_size(sizeof(struct bpf_insn_aux_data), - (*prog)->len)); + vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); ret = -ENOMEM; if (!env->insn_aux_data) goto err_free_env; + for (i = 0; i < len; i++) + env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; + is_priv = capable(CAP_SYS_ADMIN); /* grab the mutex to protect few globals used by verifier */ - mutex_lock(&bpf_verifier_lock); + if (!is_priv) + mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output @@ -7183,8 +9248,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, ret = -EINVAL; /* 
log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || + !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) goto err_unlock; } @@ -7194,6 +9259,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; + env->allow_ptr_leaks = is_priv; + ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; @@ -7204,15 +9271,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, goto skip_full_check; } - env->explored_states = kcalloc(env->prog->len, + env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = check_subprogs(env); if (ret < 0) goto skip_full_check; @@ -7242,8 +9307,17 @@ skip_full_check: ret = check_max_stack_depth(env); /* instruction rewrites happen after this point */ - if (ret == 0) - sanitize_dead_code(env); + if (is_priv) { + if (ret == 0) + opt_hard_wire_dead_code_branches(env); + if (ret == 0) + ret = opt_remove_dead_code(env); + if (ret == 0) + ret = opt_remove_nops(env); + } else { + if (ret == 0) + sanitize_dead_code(env); + } if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ @@ -7252,9 +9326,21 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + /* do 32-bit optimization after insn patching has done so those patched + * insns could be handled correctly. + */ + if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { + ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); + env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? 
!ret + : false; + } + if (ret == 0) ret = fixup_call_args(env); + env->verification_time = ktime_get_ns() - start_time; + print_verification_stats(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { @@ -7294,7 +9380,8 @@ err_release_maps: release_maps(env); *prog = env->prog; err_unlock: - mutex_unlock(&bpf_verifier_lock); + if (!is_priv) + mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: kfree(env); diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 686d244e798d..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c @@ -17,8 +17,8 @@ struct xsk_map { static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) { - int cpu, err = -EINVAL; struct xsk_map *m; + int cpu, err; u64 cost; if (!capable(CAP_NET_ADMIN)) @@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); cost += sizeof(struct list_head) * num_possible_cpus(); - if (cost >= U32_MAX - PAGE_SIZE) - goto free_m; - - m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; /* Notice returns -EPERM on if map size is larger than memlock limit */ - err = bpf_map_precharge_memlock(m->map.pages); + err = bpf_map_charge_init(&m->map.memory, cost); if (err) goto free_m; @@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) m->flush_list = alloc_percpu(struct list_head); if (!m->flush_list) - goto free_m; + goto free_charge; for_each_possible_cpu(cpu) INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); @@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) free_percpu: free_percpu(m->flush_list); +free_charge: + bpf_map_charge_finish(&m->map.memory); free_m: kfree(m); return ERR_PTR(err); @@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map) list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { xsk_flush(xs); - __list_del(xs->flush_node.prev, xs->flush_node.next); - xs->flush_node.prev = NULL; + __list_del_clearprev(&xs->flush_node); } } static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) { + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ return ERR_PTR(-EOPNOTSUPP); } @@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = { .map_free = xsk_map_free, .map_get_next_key = xsk_map_get_next_key, .map_lookup_elem = xsk_map_lookup_elem, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, .map_update_elem = xsk_map_update_elem, .map_delete_elem = xsk_map_delete_elem, .map_check_btf = map_check_no_btf, diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..1444f3954d75 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -93,9 +93,7 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); - /* - * fall through - v3 is otherwise equivalent to v2. - */ + /* fall through - v3 is otherwise equivalent to v2. 
*/ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; @@ -299,7 +297,7 @@ bool has_ns_capability(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); @@ -340,7 +338,7 @@ bool has_ns_capability_noaudit(struct task_struct *t, int ret; rcu_read_lock(); - ret = security_capable_noaudit(__task_cred(t), ns, cap); + ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); @@ -363,7 +361,9 @@ bool has_capability_noaudit(struct task_struct *t, int cap) return has_ns_capability_noaudit(t, &init_user_ns, cap); } -static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) +static bool ns_capable_common(struct user_namespace *ns, + int cap, + unsigned int opts) { int capable; @@ -372,8 +372,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) BUG(); } - capable = audit ? security_capable(current_cred(), ns, cap) : - security_capable_noaudit(current_cred(), ns, cap); + capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; @@ -394,7 +393,7 @@ static bool ns_capable_common(struct user_namespace *ns, int cap, bool audit) */ bool ns_capable(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, true); + return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); @@ -412,11 +411,30 @@ EXPORT_SYMBOL(ns_capable); */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { - return ns_capable_common(ns, cap, false); + return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** + * ns_capable_setid - Determine if the current task has a superior capability + * in effect, while signalling that this check is being done from within a + * setid syscall. + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
+ */ +bool ns_capable_setid(struct user_namespace *ns, int cap) +{ + return ns_capable_common(ns, cap, CAP_OPT_INSETID); +} +EXPORT_SYMBOL(ns_capable_setid); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * @@ -448,10 +466,11 @@ EXPORT_SYMBOL(capable); bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { + if (WARN_ON_ONCE(!cap_valid(cap))) return false; - if (security_capable(file->f_cred, ns, cap) == 0) + if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; @@ -500,10 +519,12 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; + rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) - ret = security_capable_noaudit(cred, ns, CAP_SYS_PTRACE); + ret = security_capable(cred, ns, CAP_SYS_PTRACE, + CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c950864016e2..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -7,6 +7,7 @@ #include <linux/workqueue.h> #include <linux/list.h> #include <linux/refcount.h> +#include <linux/fs_context.h> #define TRACE_CGROUP_PATH_LEN 1024 extern spinlock_t trace_cgroup_path_lock; @@ -27,16 +28,44 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) /* + * The cgroup filesystem superblock creation/mount context. + */ +struct cgroup_fs_context { + struct kernfs_fs_context kfc; + struct cgroup_root *root; + struct cgroup_namespace *ns; + unsigned int flags; /* CGRP_ROOT_* flags */ + + /* cgroup1 bits */ + bool cpuset_clone_children; + bool none; /* User explicitly requested empty subsystem */ + bool all_ss; /* Seen 'all' option */ + u16 subsys_mask; /* Selected subsystems */ + char *name; /* Hierarchy name */ + char *release_agent; /* Path for release notifications */ +}; + +static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct cgroup_fs_context, kfc); +} + +/* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other * direction, a css_set is naturally associated with multiple cgroups. 
@@ -117,16 +146,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -struct cgroup_sb_opts { - u16 subsys_mask; - unsigned int flags; - char *release_agent; - bool cpuset_clone_children; - char *name; - /* User explicitly requested empty subsystem */ - bool none; -}; - extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; @@ -197,12 +216,10 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); void cgroup_free_root(struct cgroup_root *root); -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); +void init_cgroup_root(struct cgroup_fs_context *ctx); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup_do_get_tree(struct fs_context *fc); int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp); void cgroup_migrate_finish(struct cgroup_mgctx *mgctx); @@ -226,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* @@ -246,14 +264,15 @@ extern const struct proc_ns_operations cgroupns_operations; */ extern struct cftype cgroup1_base_files[]; extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops; +extern const struct fs_parameter_description cgroup1_fs_parameters; int proc_cgroupstats_show(struct seq_file *m, void *v); bool cgroup1_ssid_disabled(int ssid); void cgroup1_pidlist_destroy_all(struct cgroup *cgrp); void cgroup1_release_agent(struct work_struct *work); void cgroup1_check_for_release(struct cgroup *cgrp); -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns); +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param); +int cgroup1_get_tree(struct fs_context *fc); +int cgroup1_reconfigure(struct fs_context *ctx); #endif /* __CGROUP_INTERNAL_H */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 583b969b0c0e..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/ctype.h> @@ -13,9 +14,12 @@ #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> +#include <linux/fs_parser.h> #include <trace/events/cgroup.h> +#define cg_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) + /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls @@ -339,22 +343,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. 
- * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ @@ -906,172 +894,195 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo return 0; } -static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) -{ - char *token, *o = data; - bool all_ss = false, one_ss = false; - u16 mask = U16_MAX; - struct cgroup_subsys *ss; - int nr_opts = 0; - int i; - -#ifdef CONFIG_CPUSETS - mask = ~((u16)1 << cpuset_cgrp_id); -#endif - - memset(opts, 0, sizeof(*opts)); +enum cgroup1_param { + Opt_all, + Opt_clone_children, + Opt_cpuset_v2_mode, + Opt_name, + Opt_none, + Opt_noprefix, + Opt_release_agent, + Opt_xattr, +}; - while ((token = strsep(&o, ",")) != NULL) { - nr_opts++; +static const struct fs_parameter_spec cgroup1_param_specs[] = { + fsparam_flag ("all", Opt_all), + fsparam_flag ("clone_children", Opt_clone_children), + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + fsparam_string("name", Opt_name), + fsparam_flag ("none", Opt_none), + fsparam_flag ("noprefix", Opt_noprefix), + fsparam_string("release_agent", Opt_release_agent), + fsparam_flag ("xattr", Opt_xattr), + {} +}; - if (!*token) - return -EINVAL; - if (!strcmp(token, "none")) { - /* Explicitly have no subsystems */ - opts->none = true; - continue; - } - if (!strcmp(token, "all")) { - /* Mutually exclusive option 'all' + subsystem name */ - if (one_ss) - return -EINVAL; - all_ss = true; - continue; - } - if (!strcmp(token, "noprefix")) { - opts->flags |= CGRP_ROOT_NOPREFIX; - continue; - } - if (!strcmp(token, "clone_children")) { - opts->cpuset_clone_children = true; - continue; - } - if (!strcmp(token, "cpuset_v2_mode")) { - opts->flags |= CGRP_ROOT_CPUSET_V2_MODE; - continue; - } - if (!strcmp(token, "xattr")) { - opts->flags |= CGRP_ROOT_XATTR; - continue; - } - if (!strncmp(token, "release_agent=", 14)) { - /* Specifying two release agents is forbidden */ - if (opts->release_agent) - return -EINVAL; - opts->release_agent = - kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); - if (!opts->release_agent) - return -ENOMEM; - continue; - } - if (!strncmp(token, "name=", 5)) { - const char *name = token + 5; - - /* blocked by boot param? 
*/ - if (cgroup_no_v1_named) - return -ENOENT; - /* Can't specify an empty name */ - if (!strlen(name)) - return -EINVAL; - /* Must match [\w.-]+ */ - for (i = 0; i < strlen(name); i++) { - char c = name[i]; - if (isalnum(c)) - continue; - if ((c == '.') || (c == '-') || (c == '_')) - continue; - return -EINVAL; - } - /* Specifying two names is forbidden */ - if (opts->name) - return -EINVAL; - opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN - 1, - GFP_KERNEL); - if (!opts->name) - return -ENOMEM; +const struct fs_parameter_description cgroup1_fs_parameters = { + .name = "cgroup1", + .specs = cgroup1_param_specs, +}; - continue; +int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct cgroup_subsys *ss; + struct fs_parse_result result; + int opt, i; + + opt = fs_parse(fc, &cgroup1_fs_parameters, param, &result); + if (opt == -ENOPARAM) { + if (strcmp(param->key, "source") == 0) { + fc->source = param->string; + param->string = NULL; + return 0; } - for_each_subsys(ss, i) { - if (strcmp(token, ss->legacy_name)) + if (strcmp(param->key, ss->legacy_name)) continue; - if (!cgroup_ssid_enabled(i)) + ctx->subsys_mask |= (1 << i); + return 0; + } + return cg_invalf(fc, "cgroup1: Unknown subsys name '%s'", param->key); + } + if (opt < 0) + return opt; + + switch (opt) { + case Opt_none: + /* Explicitly have no subsystems */ + ctx->none = true; + break; + case Opt_all: + ctx->all_ss = true; + break; + case Opt_noprefix: + ctx->flags |= CGRP_ROOT_NOPREFIX; + break; + case Opt_clone_children: + ctx->cpuset_clone_children = true; + break; + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + break; + case Opt_xattr: + ctx->flags |= CGRP_ROOT_XATTR; + break; + case Opt_release_agent: + /* Specifying two release agents is forbidden */ + if (ctx->release_agent) + return cg_invalf(fc, "cgroup1: release_agent respecified"); + ctx->release_agent = param->string; + param->string = NULL; + break; + case Opt_name: + /* blocked by boot param? 
*/ + if (cgroup_no_v1_named) + return -ENOENT; + /* Can't specify an empty name */ + if (!param->size) + return cg_invalf(fc, "cgroup1: Empty name"); + if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) + return cg_invalf(fc, "cgroup1: Name too long"); + /* Must match [\w.-]+ */ + for (i = 0; i < param->size; i++) { + char c = param->string[i]; + if (isalnum(c)) continue; - if (cgroup1_ssid_disabled(i)) + if ((c == '.') || (c == '-') || (c == '_')) continue; - - /* Mutually exclusive option 'all' + subsystem name */ - if (all_ss) - return -EINVAL; - opts->subsys_mask |= (1 << i); - one_ss = true; - - break; + return cg_invalf(fc, "cgroup1: Invalid name"); } - if (i == CGROUP_SUBSYS_COUNT) - return -ENOENT; + /* Specifying two names is forbidden */ + if (ctx->name) + return cg_invalf(fc, "cgroup1: name respecified"); + ctx->name = param->string; + param->string = NULL; + break; } + return 0; +} + +static int check_cgroupfs_options(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + u16 mask = U16_MAX; + u16 enabled = 0; + struct cgroup_subsys *ss; + int i; + +#ifdef CONFIG_CPUSETS + mask = ~((u16)1 << cpuset_cgrp_id); +#endif + for_each_subsys(ss, i) + if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) + enabled |= 1 << i; + + ctx->subsys_mask &= enabled; /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options were - * not specified, let's default to 'all' + * In absense of 'none', 'name=' or subsystem name options, + * let's default to 'all'. */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i)) - opts->subsys_mask |= (1 << i); + if (!ctx->subsys_mask && !ctx->none && !ctx->name) + ctx->all_ss = true; + + if (ctx->all_ss) { + /* Mutually exclusive option 'all' + subsystem name */ + if (ctx->subsys_mask) + return cg_invalf(fc, "cgroup1: subsys name conflicts with all"); + /* 'all' => select all the subsystems */ + ctx->subsys_mask = enabled; + } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; + if (!ctx->subsys_mask && !ctx->name) + return cg_invalf(fc, "cgroup1: Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. 
*/ - if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) - return -EINVAL; + if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) + return cg_invalf(fc, "cgroup1: noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ - if (opts->subsys_mask && opts->none) - return -EINVAL; + if (ctx->subsys_mask && ctx->none) + return cg_invalf(fc, "cgroup1: none used incorrectly"); return 0; } -static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) +int cgroup1_reconfigure(struct fs_context *fc) { - int ret = 0; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); - struct cgroup_sb_opts opts; + int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = ctx->subsys_mask & ~root->subsys_mask; + removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ - if ((opts.flags ^ root->flags) || - (opts.name && strcmp(opts.name, root->name))) { - pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", - opts.flags, opts.name ?: "", root->flags, root->name); + if ((ctx->flags ^ root->flags) || + (ctx->name && strcmp(ctx->name, root->name))) { + cg_invalf(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", + ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } @@ -1088,17 +1099,15 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); - if (opts.release_agent) { + if (ctx->release_agent) { spin_lock(&release_agent_path_lock); - strcpy(root->release_agent_path, opts.release_agent); + strcpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: - kfree(opts.release_agent); - kfree(opts.name); mutex_unlock(&cgroup_mutex); return ret; } @@ -1106,30 +1115,30 @@ static int cgroup1_remount(struct kernfs_root *kf_root, int *flags, char *data) struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, - .remount_fs = cgroup1_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; -struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, - void *data, unsigned long magic, - struct cgroup_namespace *ns) +/* + * The guts of cgroup1 mount - find or create cgroup_root to use. + * Called with cgroup_mutex held; returns 0 on success, -E... on + * error and positive - in case when the candidate is busy dying. + * On success it stashes a reference to cgroup_root into given + * cgroup_fs_context; that reference is *NOT* counting towards the + * cgroup_root refcount. 
+ */ +static int cgroup1_root_to_use(struct fs_context *fc) { - struct super_block *pinned_sb = NULL; - struct cgroup_sb_opts opts; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; - struct dentry *dentry; int i, ret; - bool new_root = false; - - cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ - ret = parse_cgroupfs_options(data, &opts); + ret = check_cgroupfs_options(fc); if (ret) - goto out_unlock; + return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may @@ -1139,16 +1148,12 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { - if (!(opts.subsys_mask & (1 << i)) || + if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; - if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - msleep(10); - ret = restart_syscall(); - goto out_free; - } + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) + return 1; /* restart */ cgroup_put(&ss->root->cgrp); } @@ -1163,8 +1168,8 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ - if (opts.name) { - if (strcmp(opts.name, root->name)) + if (ctx->name) { + if (strcmp(ctx->name, root->name)) continue; name_match = true; } @@ -1173,42 +1178,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ - if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + if ((ctx->subsys_mask || ctx->none) && + (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; - ret = -EBUSY; - goto out_unlock; + return -EBUSY; } - if (root->flags ^ opts.flags) + if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); - /* - * We want to reuse @root whose lifetime is governed by its - * ->cgrp. Let's check whether @root is alive and keep it - * that way. As cgroup_kill_sb() can happen anytime, we - * want to block it by pinning the sb so that @root doesn't - * get killed before mount is complete. - * - * With the sb pinned, tryget_live can reliably indicate - * whether @root can be reused. If it's being killed, - * drain it. We can use wait_queue for the wait but this - * path is super cold. Let's just sleep a bit and retry. - */ - pinned_sb = kernfs_pin_sb(root->kf_root, NULL); - if (IS_ERR(pinned_sb) || - !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { - mutex_unlock(&cgroup_mutex); - if (!IS_ERR_OR_NULL(pinned_sb)) - deactivate_super(pinned_sb); - msleep(10); - ret = restart_syscall(); - goto out_free; - } - - ret = 0; - goto out_unlock; + ctx->root = root; + return 0; } /* @@ -1216,62 +1197,58 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ - if (!opts.subsys_mask && !opts.none) { - ret = -EINVAL; - goto out_unlock; - } + if (!ctx->subsys_mask && !ctx->none) + return cg_invalf(fc, "cgroup1: No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. 
*/ - if (ns != &init_cgroup_ns) { - ret = -EPERM; - goto out_unlock; - } + if (ctx->ns != &init_cgroup_ns) + return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - ret = -ENOMEM; - goto out_unlock; - } - new_root = true; + if (!root) + return -ENOMEM; - init_cgroup_root(root, &opts); + ctx->root = root; + init_cgroup_root(ctx); - ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); + ret = cgroup_setup_root(root, ctx->subsys_mask); if (ret) cgroup_free_root(root); + return ret; +} -out_unlock: - mutex_unlock(&cgroup_mutex); -out_free: - kfree(opts.release_agent); - kfree(opts.name); +int cgroup1_get_tree(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - if (ret) - return ERR_PTR(ret); + /* Check if the caller has permission to mount. */ + if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; - dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, - CGROUP_SUPER_MAGIC, ns); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); - /* - * There's a race window after we release cgroup_mutex and before - * allocating a superblock. Make sure a concurrent process won't - * be able to re-use the root during this window by delaying the - * initialization of root refcnt. - */ - if (new_root) { - mutex_lock(&cgroup_mutex); - percpu_ref_reinit(&root->cgrp.self.refcnt); - mutex_unlock(&cgroup_mutex); - } + ret = cgroup1_root_to_use(fc); + if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) + ret = 1; /* restart */ - /* - * If @pinned_sb, we're reusing an existing root and holding an - * extra ref on its sb. Mount is complete. Put the extra ref. - */ - if (pinned_sb) - deactivate_super(pinned_sb); + mutex_unlock(&cgroup_mutex); - return dentry; + if (!ret) + ret = cgroup_do_get_tree(fc); + + if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + deactivate_locked_super(sb); + ret = 1; + } + + if (unlikely(ret > 0)) { + msleep(10); + return restart_syscall(); + } + return ret; } static int __init cgroup1_wq_init(void) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f31bd61c9466..300b0c416341 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include <linux/proc_ns.h> #include <linux/nsproxy.h> #include <linux/file.h> +#include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/psi.h> #include <net/sock.h> @@ -100,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); */ static DEFINE_SPINLOCK(cgroup_file_kn_lock); -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; +DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ @@ -197,7 +198,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -214,7 +215,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct 
cgroup *cgrp, struct cgroup_subsys *ss); @@ -592,6 +594,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -704,6 +739,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -782,6 +818,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; @@ -807,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -832,22 +885,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. 
- */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -1174,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -1772,26 +1813,42 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, return len; } -static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) -{ - char *token; +enum cgroup2_param { + Opt_nsdelegate, + Opt_memory_localevents, + nr__cgroup2_params +}; - *root_flags = 0; +static const struct fs_parameter_spec cgroup2_param_specs[] = { + fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("memory_localevents", Opt_memory_localevents), + {} +}; - if (!data || *data == '\0') - return 0; +static const struct fs_parameter_description cgroup2_fs_parameters = { + .name = "cgroup2", + .specs = cgroup2_param_specs, +}; - while ((token = strsep(&data, ",")) != NULL) { - if (!strcmp(token, "nsdelegate")) { - *root_flags |= CGRP_ROOT_NS_DELEGATE; - continue; - } +static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; - pr_err("cgroup2: unknown option \"%s\"\n", token); - return -EINVAL; - } + opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result); + if (opt < 0) + return opt; - return 0; + switch (opt) { + case Opt_nsdelegate: + ctx->flags |= CGRP_ROOT_NS_DELEGATE; + return 0; + case Opt_memory_localevents: + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + return 0; + } + return -EINVAL; } static void apply_cgroup_root_flags(unsigned int root_flags) @@ -1801,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; } } @@ -1808,19 +1870,16 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + seq_puts(seq, ",memory_localevents"); return 0; } -static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) +static int cgroup_reconfigure(struct fs_context *fc) { - unsigned int root_flags; - int ret; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) - return ret; - - apply_cgroup_root_flags(root_flags); + apply_cgroup_root_flags(ctx->flags); return 0; } @@ -1908,8 +1967,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) +void init_cgroup_root(struct cgroup_fs_context *ctx) { + struct cgroup_root *root = ctx->root; struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); @@ -1918,16 +1978,16 @@ void init_cgroup_root(struct 
cgroup_root *root, struct cgroup_sb_opts *opts) init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); - root->flags = opts->flags; - if (opts->release_agent) - strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); - if (opts->name) - strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); - if (opts->cpuset_clone_children) + root->flags = ctx->flags; + if (ctx->release_agent) + strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); + if (ctx->name) + strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN); + if (ctx->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1944,7 +2004,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, - ref_flags, GFP_KERNEL); + 0, GFP_KERNEL); if (ret) goto out; @@ -2028,57 +2088,104 @@ out: return ret; } -struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, - struct cgroup_root *root, unsigned long magic, - struct cgroup_namespace *ns) +int cgroup_do_get_tree(struct fs_context *fc) { - struct dentry *dentry; - bool new_sb; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + int ret; - dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); + ctx->kfc.root = ctx->root->kf_root; + if (fc->fs_type == &cgroup2_fs_type) + ctx->kfc.magic = CGROUP2_SUPER_MAGIC; + else + ctx->kfc.magic = CGROUP_SUPER_MAGIC; + ret = kernfs_get_tree(fc); /* * In non-init cgroup namespace, instead of root cgroup's dentry, * we return the dentry corresponding to the cgroupns->root_cgrp. */ - if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + if (!ret && ctx->ns != &init_cgroup_ns) { struct dentry *nsdentry; + struct super_block *sb = fc->root->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); spin_lock_irq(&css_set_lock); - cgrp = cset_cgroup_from_root(ns->root_cset, root); + cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root); spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); - dput(dentry); - dentry = nsdentry; + nsdentry = kernfs_node_dentry(cgrp->kn, sb); + dput(fc->root); + fc->root = nsdentry; + if (IS_ERR(nsdentry)) { + ret = PTR_ERR(nsdentry); + deactivate_locked_super(sb); + } } - if (IS_ERR(dentry) || !new_sb) - cgroup_put(&root->cgrp); + if (!ctx->kfc.new_sb_created) + cgroup_put(&ctx->root->cgrp); + + return ret; +} + +/* + * Destroy a cgroup filesystem context. + */ +static void cgroup_fs_context_free(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); - return dentry; + kfree(ctx->name); + kfree(ctx->release_agent); + put_cgroup_ns(ctx->ns); + kernfs_free_fs_context(fc); + kfree(ctx); } -static struct dentry *cgroup_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cgroup_get_tree(struct fs_context *fc) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; - struct dentry *dentry; + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - get_cgroup_ns(ns); + cgrp_dfl_visible = true; + cgroup_get_live(&cgrp_dfl_root.cgrp); + ctx->root = &cgrp_dfl_root; - /* Check if the caller has permission to mount. 
*/ - if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { - put_cgroup_ns(ns); - return ERR_PTR(-EPERM); - } + ret = cgroup_do_get_tree(fc); + if (!ret) + apply_cgroup_root_flags(ctx->flags); + return ret; +} + +static const struct fs_context_operations cgroup_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup2_parse_param, + .get_tree = cgroup_get_tree, + .reconfigure = cgroup_reconfigure, +}; + +static const struct fs_context_operations cgroup1_fs_context_ops = { + .free = cgroup_fs_context_free, + .parse_param = cgroup1_parse_param, + .get_tree = cgroup1_get_tree, + .reconfigure = cgroup1_reconfigure, +}; + +/* + * Initialise the cgroup filesystem creation/reconfiguration context. Notably, + * we select the namespace we're going to use. + */ +static int cgroup_init_fs_context(struct fs_context *fc) +{ + struct cgroup_fs_context *ctx; + + ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; /* * The first time anyone tries to mount a cgroup, enable the list @@ -2087,29 +2194,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - if (fs_type == &cgroup2_fs_type) { - unsigned int root_flags; - - ret = parse_cgroup_root_flags(data, &root_flags); - if (ret) { - put_cgroup_ns(ns); - return ERR_PTR(ret); - } - - cgrp_dfl_visible = true; - cgroup_get_live(&cgrp_dfl_root.cgrp); - - dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, - CGROUP2_SUPER_MAGIC, ns); - if (!IS_ERR(dentry)) - apply_cgroup_root_flags(root_flags); - } else { - dentry = cgroup1_mount(&cgroup_fs_type, flags, data, - CGROUP_SUPER_MAGIC, ns); - } - - put_cgroup_ns(ns); - return dentry; + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + fc->fs_private = &ctx->kfc; + if (fc->fs_type == &cgroup2_fs_type) + fc->ops = &cgroup_fs_context_ops; + else + fc->ops = &cgroup1_fs_context_ops; + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->ns->user_ns); + fc->global = true; + return 0; } static void cgroup_kill_sb(struct super_block *sb) @@ -2118,33 +2214,33 @@ static void cgroup_kill_sb(struct super_block *sb) struct cgroup_root *root = cgroup_root_from_kf(kf_root); /* - * If @root doesn't have any mounts or children, start killing it. + * If @root doesn't have any children, start killing it. * This prevents new mounts by disabling percpu_ref_tryget_live(). * cgroup_mount() may wait for @root's release. * * And don't kill the default root. 
*/ - if (!list_empty(&root->cgrp.self.children) || - root == &cgrp_dfl_root) - cgroup_put(&root->cgrp); - else + if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) percpu_ref_kill(&root->cgrp.self.refcnt); - + cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } struct file_system_type cgroup_fs_type = { - .name = "cgroup", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup1_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; static struct file_system_type cgroup2_fs_type = { - .name = "cgroup2", - .mount = cgroup_mount, - .kill_sb = cgroup_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "cgroup2", + .init_fs_context = cgroup_init_fs_context, + .parameters = &cgroup2_fs_parameters, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, @@ -2358,8 +2454,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. + */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -2558,7 +2661,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - goto err; + return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2590,9 +2693,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) } return 0; -err: - cgroup_migrate_finish(mgctx); - return -ENOMEM; } /** @@ -3403,8 +3503,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3454,17 +3557,118 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? 
&psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ + +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; } -#endif static int cgroup_file_open(struct kernfs_open_file *of) { @@ -3533,6 +3737,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, return ret ?: nbytes; } +static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) +{ + struct cftype *cft = of->kn->priv; + + if (cft->poll) + return cft->poll(of, pt); + + return kernfs_generic_poll(of, pt); +} + static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { return seq_cft(seq)->seq_start(seq, ppos); @@ -3571,6 +3785,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_show = cgroup_seqfile_show, }; @@ -3579,6 +3794,7 @@ static struct kernfs_ops cgroup_kf_ops = { .open = cgroup_file_open, .release = cgroup_file_release, .write = cgroup_file_write, + .poll = cgroup_file_poll, .seq_start = cgroup_seqfile_start, .seq_next = cgroup_seqfile_next, .seq_stop = cgroup_seqfile_stop, @@ -4010,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -4197,15 +4414,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = 
cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4231,9 +4451,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) +{ + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} + static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *next; + struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: @@ -4243,25 +4474,40 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. */ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && - !thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -4317,6 +4563,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); + /* @it may be half-advanced by skips, finish advancing */ + if (it->flags & CSS_TASK_ITER_SKIPPED) + css_task_iter_advance(it); + if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); @@ -4598,6 +4848,12 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_freeze_show, + .write = cgroup_freeze_write, + }, + { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, @@ -4605,20 +4861,26 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + 
.write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -4725,9 +4987,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -4742,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work) if (cgrp->kn) RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - - cgroup_bpf_put(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4945,12 +5207,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too. + */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5235,13 +5516,23 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. 
+ */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); + cgroup_bpf_offline(cgrp); + /* put the base reference */ percpu_ref_kill(&cgrp->self.refcnt); @@ -5267,7 +5558,6 @@ int cgroup_rmdir(struct kernfs_node *kn) static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, @@ -5313,7 +5603,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5334,11 +5624,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts; + static struct cgroup_fs_context __initdata ctx; struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgrp_dfl_root, &opts); + ctx.root = &cgrp_dfl_root; + init_cgroup_root(&ctx); cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF; RCU_INIT_POINTER(init_task.cgroups, &init_css_set); @@ -5376,7 +5667,6 @@ int __init cgroup_init(void) int ssid; BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); @@ -5399,7 +5689,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); @@ -5690,6 +5980,26 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. 
+ */ + } + spin_unlock_irq(&css_set_lock); } @@ -5737,7 +6047,13 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); @@ -5749,16 +6065,26 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } +} + +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } @@ -5915,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd) } EXPORT_SYMBOL_GPL(cgroup_get_from_fd); +static u64 power_of_ten(int power) +{ + u64 v = 1; + while (power--) + v *= 10; + return v; +} + +/** + * cgroup_parse_float - parse a floating number + * @input: input string + * @dec_shift: number of decimal digits to shift + * @v: output + * + * Parse a decimal floating point number in @input and store the result in + * @v with decimal point right shifted @dec_shift times. For example, if + * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. + * Returns 0 on success, -errno otherwise. + * + * There's nothing cgroup specific about this function except that it's + * currently the only user. + */ +int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) +{ + s64 whole, frac = 0; + int fstart = 0, fend = 0, flen; + + if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) + return -EINVAL; + if (frac < 0) + return -EINVAL; + + flen = fend > fstart ? fend - fstart : 0; + if (flen < dec_shift) + frac *= power_of_ten(dec_shift - flen); + else + frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); + + *v = whole * power_of_ten(dec_shift) + frac; + return 0; +} + /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. @@ -5953,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) * Don't use cgroup_get_live(). 
*/ cgroup_get(sock_cgroup_ptr(skcd)); + cgroup_bpf_get(sock_cgroup_ptr(skcd)); return; } @@ -5964,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { skcd->val = (unsigned long)cset->dfl_cgrp; + cgroup_bpf_get(cset->dfl_cgrp); break; } cpu_relax(); @@ -5974,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) void cgroup_sk_free(struct sock_cgroup_data *skcd) { - cgroup_put(sock_cgroup_ptr(skcd)); + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + cgroup_bpf_put(cgrp); + cgroup_put(cgrp); } #endif /* CONFIG_SOCK_CGROUP_DATA */ @@ -5996,7 +6369,7 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, int ret; mutex_lock(&cgroup_mutex); - ret = __cgroup_bpf_detach(cgrp, prog, type, flags); + ret = __cgroup_bpf_detach(cgrp, prog, type); mutex_unlock(&cgroup_mutex); return ret; } @@ -6057,7 +6430,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); @@ -6077,4 +6450,5 @@ static int __init cgroup_sysfs_init(void) return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); } subsys_initcall(cgroup_sysfs_init); + #endif /* CONFIG_SYSFS */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 479743db6c37..863e434a6020 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -39,6 +39,7 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> @@ -203,19 +204,6 @@ static inline struct cpuset *parent_cs(struct cpuset *cs) return css_cs(cs->css.parent); } -#ifdef CONFIG_NUMA -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return task->mempolicy; -} -#else -static inline bool task_has_mempolicy(struct task_struct *task) -{ - return false; -} -#endif - - /* bits in struct cpuset flags field */ typedef enum { CS_ONLINE, @@ -372,25 +360,52 @@ static inline bool is_in_v2_mode(void) * users. 
If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead */ -static struct dentry *cpuset_mount(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, void *data) -{ - struct file_system_type *cgroup_fs = get_fs_type("cgroup"); - struct dentry *ret = ERR_PTR(-ENODEV); - if (cgroup_fs) { - char mountopts[] = - "cpuset,noprefix," - "release_agent=/sbin/cpuset_release_agent"; - ret = cgroup_fs->mount(cgroup_fs, flags, - unused_dev_name, mountopts); - put_filesystem(cgroup_fs); +static int cpuset_get_tree(struct fs_context *fc) +{ + struct file_system_type *cgroup_fs; + struct fs_context *new_fc; + int ret; + + cgroup_fs = get_fs_type("cgroup"); + if (!cgroup_fs) + return -ENODEV; + + new_fc = fs_context_for_mount(cgroup_fs, fc->sb_flags); + if (IS_ERR(new_fc)) { + ret = PTR_ERR(new_fc); + } else { + static const char agent_path[] = "/sbin/cpuset_release_agent"; + ret = vfs_parse_fs_string(new_fc, "cpuset", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "noprefix", NULL, 0); + if (!ret) + ret = vfs_parse_fs_string(new_fc, "release_agent", + agent_path, sizeof(agent_path) - 1); + if (!ret) + ret = vfs_get_tree(new_fc); + if (!ret) { /* steal the result */ + fc->root = new_fc->root; + new_fc->root = NULL; + } + put_fs_context(new_fc); } + put_filesystem(cgroup_fs); return ret; } +static const struct fs_context_operations cpuset_fs_context_ops = { + .get_tree = cpuset_get_tree, +}; + +static int cpuset_init_fs_context(struct fs_context *fc) +{ + fc->ops = &cpuset_fs_context_ops; + return 0; +} + static struct file_system_type cpuset_fs_type = { - .name = "cpuset", - .mount = cpuset_mount, + .name = "cpuset", + .init_fs_context = cpuset_init_fs_context, }; /* @@ -714,7 +729,7 @@ static inline int nr_cpusets(void) * load balancing domains (sched domains) as specified by that partial * partition. * - * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt + * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst * for a background explanation of this. * * Does not return errors, on the theory that the callers of this @@ -725,11 +740,10 @@ static inline int nr_cpusets(void) * Must be called with cpuset_mutex held. * * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. 
* csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -760,7 +774,7 @@ static inline int nr_cpusets(void) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ @@ -2815,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task) if (task_css_is_root(task, cpuset_cgrp_id)) return; - set_cpus_allowed_ptr(task, &current->cpus_allowed); + set_cpus_allowed_ptr(task, current->cpus_ptr); task->mems_allowed = current->mems_allowed; } @@ -3240,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + **/ + void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + do_set_cpus_allowed(tsk, is_in_v2_mode() ? + task_cs(tsk)->cpus_allowed : cpu_possible_mask); rcu_read_unlock(); /* diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 08236798d173..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,481 +1,314 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater <clg@fr.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- */ - -#include <linux/export.h> -#include <linux/slab.h> +//SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/uaccess.h> -#include <linux/freezer.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. - */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; +#include "cgroup-internal.h" -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; +#include <trace/events/cgroup.h> -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { - return css ? container_of(css, struct freezer, css) : NULL; -} + int desc = 1; -static inline struct freezer *task_freezer(struct task_struct *task) -{ - return css_freezer(task_css(task, freezer_cgrp_id)); + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. + */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); + desc++; + } + } + } } -static struct freezer *parent_freezer(struct freezer *freezer) +/* + * Revisit the cgroup frozen state. + * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) { - return css_freezer(freezer->css.parent); -} + bool frozen; -bool cgroup_freezing(struct task_struct *task) -{ - bool ret; + lockdep_assert_held(&css_set_lock); - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); - return ret; -} + if (frozen) { + /* Already there? 
*/ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; - -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? */ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - return &freezer->css; + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); } -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. +/* + * Increment cgroup's nr_frozen_tasks. */ -static int freezer_css_online(struct cgroup_subsys_state *css) +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; + cgrp->freezer.nr_frozen_tasks++; } -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. +/* + * Decrement cgroup's nr_frozen_tasks. */ -static void freezer_css_offline(struct cgroup_subsys_state *css) +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } -static void freezer_css_free(struct cgroup_subsys_state *css) +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) { - kfree(css_freezer(css)); + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } /* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. 
In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. */ -static void freezer_attach(struct cgroup_taskset *tset) +void cgroup_leave_frozen(bool always_leave) { - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); - - /* - * Make the new tasks conform to the current state of @new_css. - * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } + struct cgroup *cgrp; + + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(&current->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(&current->sighand->siglock); + } - - mutex_unlock(&freezer_mutex); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. */ -static void freezer_fork(struct task_struct *task) +static void cgroup_freeze_task(struct task_struct *task, bool freeze) { - struct freezer *freezer; + unsigned long flags; - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) + /* If the task is about to die, don't bother with freezing it. */ + if (!lock_task_sighand(task, &flags)) return; - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); + if (freeze) { + task->jobctl |= JOBCTL_TRAP_FREEZE; + signal_wake_up(task, false); + } else { + task->jobctl &= ~JOBCTL_TRAP_FREEZE; + wake_up_process(task); + } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); + unlock_task_sighand(task, &flags); } -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. 
If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. +/* + * Freeze or unfreeze all tasks in the given cgroup. */ -static void update_if_frozen(struct cgroup_subsys_state *css) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) { - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; struct css_task_iter it; struct task_struct *task; - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; + lockdep_assert_held(&cgroup_mutex); - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); - - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + spin_lock_irq(&css_set_lock); + if (freeze) + set_bit(CGRP_FREEZE, &cgrp->flags); + else + clear_bit(CGRP_FREEZE, &cgrp->flags); + spin_unlock_irq(&css_set_lock); - /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); + if (freeze) + TRACE_CGROUP_PATH(freeze, cgrp); + else + TRACE_CGROUP_PATH(unfreeze, cgrp); + css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) + /* + * Ignore kernel threads here. Freezing cgroups containing + * kthreads isn't supported. + */ + if (task->flags & PF_KTHREAD) continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); + cgroup_freeze_task(task, freeze); } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); css_task_iter_end(&it); -} -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); + /* + * Cgroup state should be revisited here to cover empty leaf cgroups + * and cgroups which descendants are already in the desired state. 
+ */ + spin_lock_irq(&css_set_lock); + if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups. */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) +void cgroup_freezer_migrate_task(struct task_struct *task, + struct cgroup *src, struct cgroup *dst) { - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); + lockdep_assert_held(&css_set_lock); - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + /* + * Kernel threads are not supposed to be frozen at all. + */ + if (task->flags & PF_KTHREAD) return; - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } + /* + * Adjust counters of freezing and frozen tasks. + * Note, that if the task is frozen, but the destination cgroup is not + * frozen, we bump both counters to keep them balanced. + */ + if (task->frozen) { + cgroup_inc_frozen_cnt(dst); + cgroup_dec_frozen_cnt(src); } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; + cgroup_update_frozen(dst); + cgroup_update_frozen(src); /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. + * Force the task to the desired state. 
*/ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); + cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +void cgroup_freeze(struct cgroup *cgrp, bool freeze) { - bool freeze; + struct cgroup_subsys_state *css; + struct cgroup *dsct; + bool applied = false; - buf = strstrip(buf); + lockdep_assert_held(&cgroup_mutex); - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; + /* + * Nothing changed? Just exit. + */ + if (cgrp->freezer.freeze == freeze) + return; - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} + cgrp->freezer.freeze = freeze; -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); + /* + * Propagate changes downwards the cgroup tree. + */ + css_for_each_descendant_pre(css, &cgrp->self) { + dsct = css->cgroup; - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} + if (cgroup_is_dead(dsct)) + continue; -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); + if (freeze) { + dsct->freezer.e_freeze++; + /* + * Already frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 1) + continue; + } else { + dsct->freezer.e_freeze--; + /* + * Still frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 0) + continue; - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} + WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + } + + /* + * Do change actual state: freeze or unfreeze. + */ + cgroup_do_freeze(dsct, freeze); + applied = true; + } -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; - -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; + /* + * Even if the actual state hasn't changed, let's notify a user. + * The state can be enforced by an ancestor cgroup: the cgroup + * can already be in the desired state or it can be locked in the + * opposite state, so that the transition will never happen. + * In both cases it's better to notify a user, that there is + * nothing to wait for. 
+ */ + if (!applied) { + TRACE_CGROUP_PATH(notify_frozen, cgrp, + test_bit(CGRP_FROZEN, &cgrp->flags)); + cgroup_file_notify(&cgrp->events_file); + } +} diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { + struct cgroup_subsys_state css; + unsigned int state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ + return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ + bool ret; + + rcu_read_lock(); + ret = task_freezer(task)->state & CGROUP_FREEZING; + rcu_read_unlock(); + + return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. 
+ */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct freezer *parent = parent_freezer(freezer); + + mutex_lock(&freezer_mutex); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + mutex_unlock(&freezer_mutex); + return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + + mutex_lock(&freezer_mutex); + + if (freezer->state & CGROUP_FREEZING) + atomic_dec(&system_freezing_cnt); + + freezer->state = 0; + + mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. 
+ */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? */ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, 0, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. 
+ */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..8e513a573fe9 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. 
* @@ -25,10 +26,6 @@ * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include <linux/kernel.h> @@ -247,7 +244,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -342,7 +339,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index d3bbb757ee49..ae042c347c64 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * @@ -5,10 +6,6 @@ * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include <linux/bitops.h> @@ -313,10 +310,8 @@ EXPORT_SYMBOL(rdmacg_try_charge); * If IB stack wish a device to participate in rdma cgroup resource * tracking, it must invoke this API to register with rdma cgroup before * any user space application can start using the RDMA resources. - * Returns 0 on success or EINVAL when table length given is beyond - * supported size. */ -int rdmacg_register_device(struct rdmacg_device *device) +void rdmacg_register_device(struct rdmacg_device *device) { INIT_LIST_HEAD(&device->dev_node); INIT_LIST_HEAD(&device->rpools); @@ -324,7 +319,6 @@ int rdmacg_register_device(struct rdmacg_device *device) mutex_lock(&rdmacg_mutex); list_add_tail(&device->dev_node, &rdmacg_devices); mutex_unlock(&rdmacg_mutex); - return 0; } EXPORT_SYMBOL(rdmacg_register_device); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d503d1a9007c..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> @@ -87,7 +88,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; - struct cgroup *parent; if (pos == root) return NULL; @@ -115,8 +115,8 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - parent = cgroup_parent(pos); - if (parent && rstatc->updated_next) { + if (rstatc->updated_next) { + struct cgroup *parent = cgroup_parent(pos); struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); struct cgroup_rstat_cpu *nrstatc; struct cgroup **nextp; @@ -140,9 +140,12 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * updated stat. 
*/ smp_mb(); + + return pos; } - return pos; + /* only happens for @root */ + return NULL; } /* see cgroup_rstat_flush() */ diff --git a/kernel/compat.c b/kernel/compat.c index f01affa17e22..a2bc1d6ceb57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/compat.c * @@ -5,10 +6,6 @@ * on 64 bit kernels. * * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> @@ -20,7 +17,6 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/security.h> -#include <linux/timex.h> #include <linux/export.h> #include <linux/migrate.h> #include <linux/posix-timers.h> @@ -30,69 +26,6 @@ #include <linux/uaccess.h> -int compat_get_timex(struct timex *txc, const struct compat_timex __user *utp) -{ - struct compat_timex tx32; - - memset(txc, 0, sizeof(struct timex)); - if (copy_from_user(&tx32, utp, sizeof(struct compat_timex))) - return -EFAULT; - - txc->modes = tx32.modes; - txc->offset = tx32.offset; - txc->freq = tx32.freq; - txc->maxerror = tx32.maxerror; - txc->esterror = tx32.esterror; - txc->status = tx32.status; - txc->constant = tx32.constant; - txc->precision = tx32.precision; - txc->tolerance = tx32.tolerance; - txc->time.tv_sec = tx32.time.tv_sec; - txc->time.tv_usec = tx32.time.tv_usec; - txc->tick = tx32.tick; - txc->ppsfreq = tx32.ppsfreq; - txc->jitter = tx32.jitter; - txc->shift = tx32.shift; - txc->stabil = tx32.stabil; - txc->jitcnt = tx32.jitcnt; - txc->calcnt = tx32.calcnt; - txc->errcnt = tx32.errcnt; - txc->stbcnt = tx32.stbcnt; - - return 0; -} - -int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) -{ - struct compat_timex tx32; - - memset(&tx32, 0, sizeof(struct compat_timex)); - tx32.modes = txc->modes; - tx32.offset = txc->offset; - tx32.freq = txc->freq; - tx32.maxerror = txc->maxerror; - tx32.esterror = txc->esterror; - tx32.status = txc->status; - tx32.constant = txc->constant; - tx32.precision = txc->precision; - tx32.tolerance = txc->tolerance; - tx32.time.tv_sec = txc->time.tv_sec; - tx32.time.tv_usec = txc->time.tv_usec; - tx32.tick = txc->tick; - tx32.ppsfreq = txc->ppsfreq; - tx32.jitter = txc->jitter; - tx32.shift = txc->shift; - tx32.stabil = txc->stabil; - tx32.jitcnt = txc->jitcnt; - tx32.calcnt = txc->calcnt; - tx32.errcnt = txc->errcnt; - tx32.stbcnt = txc->stbcnt; - tx32.tai = txc->tai; - if (copy_to_user(utp, &tx32, sizeof(struct compat_timex))) - return -EFAULT; - return 0; -} - static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { return (!access_ok(ctv, sizeof(*ctv)) || @@ -410,8 +343,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) return -EFAULT; switch (_NSIG_WORDS) { case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); + /* fall through */ case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); + /* fall through */ case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); + /* fall through */ case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); } #else diff --git a/kernel/configs.c b/kernel/configs.c index 2df132b20217..b062425ccf8d 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -30,37 +30,35 @@ #include <linux/init.h> #include <linux/uaccess.h> -/**************************************************/ -/* the actual current 
config file */ - /* - * Define kernel_config_data and kernel_config_data_size, which contains the - * wrapped and compressed configuration file. The file is first compressed - * with gzip and then bounded by two eight byte magic numbers to allow - * extraction from a binary kernel image: - * - * IKCFG_ST - * <image> - * IKCFG_ED + * "IKCFG_ST" and "IKCFG_ED" are used to extract the config data from + * a binary kernel image or a module. See scripts/extract-ikconfig. */ -#define MAGIC_START "IKCFG_ST" -#define MAGIC_END "IKCFG_ED" -#include "config_data.h" - - -#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) -#define kernel_config_data_size \ - (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) +asm ( +" .pushsection .rodata, \"a\" \n" +" .ascii \"IKCFG_ST\" \n" +" .global kernel_config_data \n" +"kernel_config_data: \n" +" .incbin \"kernel/config_data.gz\" \n" +" .global kernel_config_data_end \n" +"kernel_config_data_end: \n" +" .ascii \"IKCFG_ED\" \n" +" .popsection \n" +); #ifdef CONFIG_IKCONFIG_PROC +extern char kernel_config_data; +extern char kernel_config_data_end; + static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) { return simple_read_from_buffer(buf, len, offset, - kernel_config_data + MAGIC_SIZE, - kernel_config_data_size); + &kernel_config_data, + &kernel_config_data_end - + &kernel_config_data); } static const struct file_operations ikconfig_file_ops = { @@ -79,7 +77,7 @@ static int __init ikconfig_init(void) if (!entry) return -ENOMEM; - proc_set_size(entry, kernel_config_data_size); + proc_set_size(entry, &kernel_config_data_end - &kernel_config_data); return 0; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9ad37b9e44a7..be01a4d627c9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Context tracking: Probe on high level context boundaries such as kernel * and userspace. This includes syscalls and exceptions entry/exit. diff --git a/kernel/cpu.c b/kernel/cpu.c index d1c6d152da89..e84c0873559e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@ #include <linux/notifier.h> #include <linux/sched/signal.h> #include <linux/sched/hotplug.h> +#include <linux/sched/isolation.h> #include <linux/sched/task.h> #include <linux/sched/smt.h> #include <linux/unistd.h> @@ -313,6 +314,15 @@ void cpus_write_unlock(void) void lockdep_assert_cpus_held(void) { + /* + * We can't have hotplug operations before userspace starts running, + * and some init codepaths will knowingly not take the hotplug lock. + * This is all valid, so mute lockdep until it makes sense to report + * unheld locks. + */ + if (system_state < SYSTEM_RUNNING) + return; + percpu_rwsem_assert_held(&cpu_hotplug_lock); } @@ -512,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu) /* * SMT soft disabling on X86 requires to bring the CPU out of the * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The - * CPU marked itself as booted_once in cpu_notify_starting() so the + * CPU marked itself as booted_once in notify_cpu_starting() so the * cpu_smt_allowed() check will now return false if this is not the * primary sibling. 
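The configs.c rework above keeps the gzip-compressed .config embedded between the "IKCFG_ST"/"IKCFG_ED" markers (now pulled in with .incbin) and, with CONFIG_IKCONFIG_PROC, still exposes it as /proc/config.gz. A small userspace sketch of consuming that file, assuming zlib is available (link with -lz):

#include <stdio.h>
#include <zlib.h>

int main(void)
{
	gzFile f = gzopen("/proc/config.gz", "rb");
	char line[1024];

	if (!f) {
		fprintf(stderr, "cannot open /proc/config.gz\n");
		return 1;
	}
	/* gzgets() transparently decompresses, one line at a time. */
	while (gzgets(f, line, sizeof(line)))
		fputs(line, stdout);
	gzclose(f);
	return 0;
}

For a kernel image or module rather than a running kernel, scripts/extract-ikconfig locates the same payload via the two magic strings, as the new comment in configs.c notes.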
*/ @@ -555,6 +565,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL); } +static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) +{ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + /* + * When CPU hotplug is disabled, then taking the CPU down is not + * possible because takedown_cpu() and the architecture and + * subsystem specific mechanisms are not available. So the CPU + * which would be completely unplugged again needs to stay around + * in the current state. + */ + return st->state <= CPUHP_BRINGUP_CPU; +} + static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -565,8 +589,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); + if (can_rollback_cpu(st)) { + st->target = prev_state; + undo_cpu_up(cpu, st); + } break; } } @@ -835,6 +861,8 @@ static int take_cpu_down(void *_param) /* Give up timekeeping duties */ tick_handover_do_timer(); + /* Remove CPU from timer broadcasting */ + tick_offline_cpu(cpu); /* Park the stopper thread */ stop_machine_park(cpu); return 0; @@ -1174,8 +1202,15 @@ int freeze_secondary_cpus(int primary) int cpu, error = 0; cpu_maps_update_begin(); - if (!cpu_online(primary)) + if (primary == -1) { primary = cpumask_first(cpu_online_mask); + if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) + primary = housekeeping_any_cpu(HK_FLAG_TIMER); + } else { + if (!cpu_online(primary)) + primary = cpumask_first(cpu_online_mask); + } + /* * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time @@ -1186,6 +1221,13 @@ int freeze_secondary_cpus(int primary) for_each_online_cpu(cpu) { if (cpu == primary) continue; + + if (pm_wakeup_pending()) { + pr_info("Wakeup pending. Abort CPU freeze\n"); + error = -EBUSY; + break; + } + trace_suspend_resume(TPS("CPU_OFF"), cpu, true); error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); @@ -1929,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev, if (ret) return ret; + if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) + return -EINVAL; + /* * Cannot fail STARTING/DYING callbacks. 
*/ @@ -2008,19 +2053,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT -static const char *smt_states[] = { - [CPU_SMT_ENABLED] = "on", - [CPU_SMT_DISABLED] = "off", - [CPU_SMT_FORCE_DISABLED] = "forceoff", - [CPU_SMT_NOT_SUPPORTED] = "notsupported", -}; - -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); -} - static void cpuhp_offline_cpu_device(unsigned int cpu) { struct device *dev = get_cpu_device(cpu); @@ -2039,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) kobject_uevent(&dev->kobj, KOBJ_ONLINE); } -static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) +int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { int cpu, ret = 0; @@ -2071,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) return ret; } -static int cpuhp_smt_enable(void) +int cpuhp_smt_enable(void) { int cpu, ret = 0; @@ -2091,9 +2123,10 @@ static int cpuhp_smt_enable(void) return ret; } + static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { int ctrlval, ret; @@ -2131,14 +2164,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr, unlock_device_hotplug(); return ret ? ret : count; } + +#else /* !CONFIG_HOTPLUG_SMT */ +static ssize_t +__store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return -ENODEV; +} +#endif /* CONFIG_HOTPLUG_SMT */ + +static const char *smt_states[] = { + [CPU_SMT_ENABLED] = "on", + [CPU_SMT_DISABLED] = "off", + [CPU_SMT_FORCE_DISABLED] = "forceoff", + [CPU_SMT_NOT_SUPPORTED] = "notsupported", + [CPU_SMT_NOT_IMPLEMENTED] = "notimplemented", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ + const char *state = smt_states[cpu_smt_control]; + + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + return __store_smt_control(dev, attr, buf, count); +} static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control); static ssize_t show_smt_active(struct device *dev, struct device_attribute *attr, char *buf) { - bool active = topology_max_smt_threads() > 1; - - return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); + return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active()); } static DEVICE_ATTR(active, 0444, show_smt_active, NULL); @@ -2154,21 +2217,17 @@ static const struct attribute_group cpuhp_smt_attr_group = { NULL }; -static int __init cpu_smt_state_init(void) +static int __init cpu_smt_sysfs_init(void) { return sysfs_create_group(&cpu_subsys.dev_root->kobj, &cpuhp_smt_attr_group); } -#else -static inline int cpu_smt_state_init(void) { return 0; } -#endif - static int __init cpuhp_sysfs_init(void) { int cpu, ret; - ret = cpu_smt_state_init(); + ret = cpu_smt_sysfs_init(); if (ret) return ret; @@ -2189,7 +2248,7 @@ static int __init cpuhp_sysfs_init(void) return 0; } device_initcall(cpuhp_sysfs_init); -#endif +#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */ /* * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -2279,3 +2338,21 @@ void __init boot_cpu_hotplug_init(void) #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } + 
+enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ + if (!strcmp(arg, "off")) + cpu_mitigations = CPU_MITIGATIONS_OFF; + else if (!strcmp(arg, "auto")) + cpu_mitigations = CPU_MITIGATIONS_AUTO; + else if (!strcmp(arg, "auto,nosmt")) + cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + else + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", + arg); + + return 0; +} +early_param("mitigations", mitigations_parse_cmdline); diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 67b02e138a47..cbca6879ab7d 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c @@ -1,18 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Google, Inc. * * Author: * Colin Cross <ccross@android.com> - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include <linux/kernel.h> diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..9f1557b98468 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * crash.c - kernel crash support code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <linux/crash_core.h> @@ -464,6 +462,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) + VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif arch_crash_save_vmcoreinfo(); diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index b64e238b553b..9c23ae074b40 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/crash_dump.h> #include <linux/init.h> diff --git a/kernel/cred.c b/kernel/cred.c index 21f4a97085b4..f9a0ce66c9c3 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Task credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
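The mitigations= parser added to kernel/cpu.c above gives architectures a single switch ("off", "auto", "auto,nosmt") for their speculation defenses; the same series pairs it with small query helpers in <linux/cpu.h> (cpu_mitigations_off() and friends, if memory serves). A rough, illustrative sketch of how arch code can key off the parsed mode; my_arch_select_mitigation() and spectre_v2_enable() are made-up names, not part of this patch:

/* Illustrative only: cpu_mitigations and the CPU_MITIGATIONS_* values are
 * the ones added above; the two functions below are hypothetical. */
static void __init my_arch_select_mitigation(void)
{
	if (cpu_mitigations == CPU_MITIGATIONS_OFF)
		return;				/* booted with "mitigations=off" */

	spectre_v2_enable();			/* hypothetical arch-specific fix */

	if (cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT)
		pr_info("mitigations=auto,nosmt: disable SMT as well\n");
}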
*/ #include <linux/export.h> #include <linux/cred.h> @@ -174,6 +170,11 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); + +#ifdef CONFIG_KEYS_REQUEST_CACHE + key_put(current->cached_requested_key); + current->cached_requested_key = NULL; +#endif } /** @@ -327,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; +#ifdef CONFIG_KEYS_REQUEST_CACHE + p->cached_requested_key = NULL; +#endif + if ( #ifdef CONFIG_KEYS !p->cred->thread_keyring && @@ -450,14 +455,23 @@ int commit_creds(struct cred *new) if (task->mm) set_dumpable(task->mm, suid_dumpable); task->pdeath_signal = 0; + /* + * If a task drops privileges and becomes nondumpable, + * the dumpability change must become visible before + * the credential change; otherwise, a __ptrace_may_access() + * racing with this change may be able to attach to a task it + * shouldn't be able to attach to (as if the task had dropped + * privileges without becoming nondumpable). + * Pairs with a read barrier in __ptrace_may_access(). + */ smp_wmb(); } /* alter the thread keyring */ if (!uid_eq(new->fsuid, old->fsuid)) - key_fsuid_changed(task); + key_fsuid_changed(new); if (!gid_eq(new->fsgid, old->fsgid)) - key_fsgid_changed(task); + key_fsgid_changed(new); /* do it * RLIMIT_NPROC limits on user->processes have already been checked @@ -760,19 +774,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; -#ifdef CONFIG_SECURITY_SELINUX - /* - * cred->security == NULL if security_cred_alloc_blank() or - * security_prepare_creds() returned an error. - */ - if (selinux_is_enabled() && cred->security) { - if ((unsigned long) cred->security < PAGE_SIZE) - return true; - if ((*(u32 *)cred->security & 0xffffff00) == - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) - return true; - } -#endif return false; } EXPORT_SYMBOL(creds_are_invalid); diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index a85edc339985..332ee6c6ec2c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Makefile for the linux kernel debugger # diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks) return DBG_PASS_EVENT; } #endif + /* Fall through */ case 'C': /* Exception passing */ tmp = gdb_cmd_exception_pass(ks); if (tmp > 0) goto default_handle; if (tmp == 0) break; - /* Fall through on tmp < 0 */ + /* Fall through - on tmp < 0 */ case 'c': /* Continue packet */ case 's': /* Single step packet */ if (kgdb_contthread && kgdb_contthread != current) { @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks) break; } dbg_activate_sw_breakpoints(); - /* Fall through to default processing */ + /* Fall through - to default processing */ default: default_handle: error = kgdb_arch_handle_exception(ks->ex_vector, @@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) return error; case 's': case 'c': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); return 0; case '$': - strcpy(remcom_in_buffer, cmd); + strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); gdbstub_prev_in_buf_pos = 0; return 0; diff --git a/kernel/debug/kdb/Makefile 
b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile @@ -6,7 +6,6 @@ # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. # -CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -446,7 +446,7 @@ poll_again: char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) { if (prompt && kdb_prompt_str != prompt) - strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); kdb_printf(kdb_prompt_str); kdb_nextline = 1; /* Prompt and input resets line number */ return kdb_read(buffer, bufsize); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("machine %s\n", init_uts_ns.name.machine); kdb_printf("nodename %s\n", init_uts_ns.name.nodename); kdb_printf("domainname %s\n", init_uts_ns.name.domainname); - kdb_printf("ccversion %s\n", __stringify(CCVERSION)); now = __ktime_get_real_seconds(); time64_to_tm(now, 0, &tm); @@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv) diag = kdbgetularg(argv[3], &whichcpu); if (diag) return diag; - if (!cpu_online(whichcpu)) { + if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { kdb_printf("cpu %ld is not online\n", whichcpu); return KDB_BADCPUNUM; } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) while ((name = kdb_walk_kallsyms(&pos))) { if (strncmp(name, prefix_name, prefix_len) == 0) { - strcpy(ks_namebuf, name); + strscpy(ks_namebuf, name, sizeof(ks_namebuf)); /* Work out the longest name that matches the prefix */ if (++number == 1) { prev_len = min_t(int, max_len-1, diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2a12b988c717..27725754ac99 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* delayacct.c - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. 
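The kdb/gdbstub changes above swap strcpy()/strncpy() for strscpy(), which both bounds the copy and guarantees NUL termination. A short sketch of the calling convention; copy_cmd() is an invented wrapper, not from this patch:

#include <linux/errno.h>
#include <linux/string.h>

static int copy_cmd(char *dst, size_t dst_size, const char *src)
{
	ssize_t len = strscpy(dst, src, dst_size);

	/* Unlike strncpy(), dst is always NUL-terminated here; on truncation
	 * strscpy() reports -E2BIG instead of silently chopping the string. */
	return len < 0 ? -E2BIG : 0;
}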
*/ #include <linux/sched.h> diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index ca88b867e7fe..70f8f8d9200e 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAS_DMA bool @@ -16,7 +17,16 @@ config ARCH_DMA_ADDR_T_64BIT config ARCH_HAS_DMA_COHERENCE_H bool -config HAVE_GENERIC_DMA_COHERENT +config ARCH_HAS_DMA_SET_MASK + bool + +config DMA_DECLARE_COHERENT + bool + +config ARCH_HAS_SETUP_DMA_OPS + bool + +config ARCH_HAS_TEARDOWN_DMA_OPS bool config ARCH_HAS_SYNC_DMA_FOR_DEVICE @@ -29,6 +39,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL bool +config ARCH_HAS_DMA_PREP_COHERENT + bool + config ARCH_HAS_DMA_COHERENT_TO_PFN bool @@ -48,8 +61,122 @@ config SWIOTLB config DMA_REMAP depends on MMU + select GENERIC_ALLOCATOR bool config DMA_DIRECT_REMAP bool select DMA_REMAP + +config DMA_CMA + bool "DMA Contiguous Memory Allocator" + depends on HAVE_DMA_CONTIGUOUS && CMA + help + This enables the Contiguous Memory Allocator which allows drivers + to allocate big physically-contiguous blocks of memory for use with + hardware components that do not support I/O map nor scatter-gather. + + You can disable CMA by specifying "cma=0" on the kernel's command + line. + + For more information see <include/linux/dma-contiguous.h>. + If unsure, say "n". + +if DMA_CMA +comment "Default contiguous memory area size:" + +config CMA_SIZE_MBYTES + int "Size in Mega Bytes" + depends on !CMA_SIZE_SEL_PERCENTAGE + default 0 if X86 + default 16 + help + Defines the size (in MiB) of the default memory area for Contiguous + Memory Allocator. If the size of 0 is selected, CMA is disabled by + default, but it can be enabled by passing cma=size[MG] to the kernel. + + +config CMA_SIZE_PERCENTAGE + int "Percentage of total memory" + depends on !CMA_SIZE_SEL_MBYTES + default 0 if X86 + default 10 + help + Defines the size of the default memory area for Contiguous Memory + Allocator as a percentage of the total memory in the system. + If 0 percent is selected, CMA is disabled by default, but it can be + enabled by passing cma=size[MG] to the kernel. + +choice + prompt "Selected region size" + default CMA_SIZE_SEL_MBYTES + +config CMA_SIZE_SEL_MBYTES + bool "Use mega bytes value only" + +config CMA_SIZE_SEL_PERCENTAGE + bool "Use percentage value only" + +config CMA_SIZE_SEL_MIN + bool "Use lower value (minimum)" + +config CMA_SIZE_SEL_MAX + bool "Use higher value (maximum)" + +endchoice + +config CMA_ALIGNMENT + int "Maximum PAGE_SIZE order of alignment for contiguous buffers" + range 4 12 + default 8 + help + DMA mapping framework by default aligns all buffers to the smallest + PAGE_SIZE order which is greater than or equal to the requested buffer + size. This works well for buffers up to a few hundreds kilobytes, but + for larger buffers it just a memory waste. With this parameter you can + specify the maximum PAGE_SIZE order for contiguous buffers. Larger + buffers will be aligned only to this specified order. The order is + expressed as a power of two multiplied by the PAGE_SIZE. + + For example, if your system defaults to 4KiB pages, the order value + of 8 means that the buffers will be aligned up to 1MiB only. + + If unsure, leave the default value "8". + +endif + +config DMA_API_DEBUG + bool "Enable debugging of DMA-API usage" + select NEED_DMA_MAP_STATE + help + Enable this option to debug the use of the DMA API by device drivers. 
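DMA_API_DEBUG is chiefly after unbalanced or mis-sized mappings. A hedged driver-side sketch of the pairing it verifies; send_buf() is an invented example, not from this patch:

#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int send_buf(struct device *dev, void *buf, size_t len)
{
	dma_addr_t addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	/* ... hand addr to the hardware and wait for completion ... */

	/* dma-debug warns if this unmap is missing, done twice, or done
	 * with a different size/direction than the original map. */
	dma_unmap_single(dev, addr, len, DMA_TO_DEVICE);
	return 0;
}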
+ With this option you will be able to detect common bugs in device + drivers like double-freeing of DMA mappings or freeing mappings that + were never allocated. + + This also attempts to catch cases where a page owned by DMA is + accessed by the cpu in a way that could cause data corruption. For + example, this enables cow_user_page() to check that the source page is + not undergoing DMA. + + This option causes a performance degradation. Use only if you want to + debug device drivers and dma interactions. + + If unsure, say N. + +config DMA_API_DEBUG_SG + bool "Debug DMA scatter-gather usage" + default y + depends on DMA_API_DEBUG + help + Perform extra checking that callers of dma_map_sg() have respected the + appropriate segment length/boundary limits for the given device when + preparing DMA scatterlists. + + This is particularly likely to have been overlooked in cases where the + dma_map_sg() API is used for general bulk mapping of pages rather than + preparing literal scatter-gather descriptors, where there is a risk of + unexpected behaviour from DMA API implementations if the scatterlist + is technically out-of-spec. + + If unsure, say N. diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 72ff6e46aa86..d237cf3dc181 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_HAS_DMA) += mapping.o direct.o dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o +obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 66f0fb7e9a3a..29fd6590dc1e 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -14,7 +14,6 @@ struct dma_coherent_mem { dma_addr_t device_base; unsigned long pfn_base; int size; - int flags; unsigned long *bitmap; spinlock_t spinlock; bool use_dev_dma_pfn_offset; @@ -38,12 +37,12 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static int dma_init_coherent_memory( - phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, - struct dma_coherent_mem **mem) +static int dma_init_coherent_memory(phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, + struct dma_coherent_mem **mem) { struct dma_coherent_mem *dma_mem = NULL; - void __iomem *mem_base = NULL; + void *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); int ret; @@ -73,7 +72,6 @@ static int dma_init_coherent_memory( dma_mem->device_base = device_addr; dma_mem->pfn_base = PFN_DOWN(phys_addr); dma_mem->size = pages; - dma_mem->flags = flags; spin_lock_init(&dma_mem->spinlock); *mem = dma_mem; @@ -110,12 +108,12 @@ static int dma_assign_coherent_memory(struct device *dev, } int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) + dma_addr_t device_addr, size_t size) { struct dma_coherent_mem *mem; int ret; - ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + ret = dma_init_coherent_memory(phys_addr, device_addr, size, &mem); if (ret) return ret; @@ -137,29 +135,6 @@ void dma_release_declared_memory(struct device *dev) } EXPORT_SYMBOL(dma_release_declared_memory); -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - unsigned long flags; - int pos, 
err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&mem->spinlock, flags); - pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - spin_unlock_irqrestore(&mem->spinlock, flags); - - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, ssize_t size, dma_addr_t *dma_handle) { @@ -213,15 +188,7 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, return 0; *ret = __dma_alloc_from_coherent(mem, size, dma_handle); - if (*ret) - return 1; - - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; + return 1; } void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) @@ -350,8 +317,7 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) if (!mem) { ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); + rmem->size, &mem); if (ret) { pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index b2a87905846d..bfc0c17f2a3d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, return cma_release(dev_get_cma_area(dev), pages, count); } +/** + * dma_alloc_contiguous() - allocate contiguous pages + * @dev: Pointer to device for which the allocation is performed. + * @size: Requested allocation size. + * @gfp: Allocation flags. + * + * This function allocates contiguous memory buffer for specified device. It + * first tries to use device specific contiguous memory area if available or + * the default global one, then tries a fallback allocation of normal pages. + * + * Note that it bypasses one-page size allocations from the global area, as + * the addresses within one page are always contiguous, so there is no need + * to waste CMA pages for those; it also helps reduce fragmentation. + */ +struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) +{ + int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; + size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; + size_t align = get_order(PAGE_ALIGN(size)); + struct page *page = NULL; + struct cma *cma = NULL; + + if (dev && dev->cma_area) + cma = dev->cma_area; + else if (count > 1) + cma = dma_contiguous_default_area; + + /* CMA can be used only in the context which permits sleeping */ + if (cma && gfpflags_allow_blocking(gfp)) { + align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); + page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); + } + + /* Fallback allocation of normal pages */ + if (!page) + page = alloc_pages_node(node, gfp, align); + return page; +} + +/** + * dma_free_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @page: Pointer to the allocated pages. + * @size: Size of allocated pages. + * + * This function releases memory allocated by dma_alloc_contiguous(). 
As the + * cma_release returns false when provided pages do not belong to contiguous + * area and true otherwise, this function then does a fallback __free_pages() + * upon a false-return. + */ +void dma_free_contiguous(struct device *dev, struct page *page, size_t size) +{ + if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) + __free_pages(page, get_order(size)); +} + /* * Support for reserved memory regions defined in device tree */ diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 23cf5361bcf1..099002d84f46 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 Advanced Micro Devices, Inc. * * Author: Joerg Roedel <joerg.roedel@amd.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define pr_fmt(fmt) "DMA-API: " fmt @@ -89,8 +77,8 @@ struct dma_debug_entry { int sg_mapped_ents; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE - struct stack_trace stacktrace; - unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; + unsigned int stack_len; + unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif }; @@ -134,17 +122,6 @@ static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; -/* debugfs dentry's for the stuff above */ -static struct dentry *dma_debug_dent __read_mostly; -static struct dentry *global_disable_dent __read_mostly; -static struct dentry *error_count_dent __read_mostly; -static struct dentry *show_all_errors_dent __read_mostly; -static struct dentry *show_num_errors_dent __read_mostly; -static struct dentry *num_free_entries_dent __read_mostly; -static struct dentry *min_free_entries_dent __read_mostly; -static struct dentry *nr_total_entries_dent __read_mostly; -static struct dentry *filter_dent __read_mostly; - /* per-driver filter related state */ #define NAME_MAX_LEN 64 @@ -185,7 +162,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry) #ifdef CONFIG_STACKTRACE if (entry) { pr_warning("Mapped at:\n"); - print_stack_trace(&entry->stacktrace, 0); + stack_trace_print(entry->stack_entries, entry->stack_len, 0); } #endif } @@ -715,12 +692,10 @@ static struct dma_debug_entry *dma_entry_alloc(void) spin_unlock_irqrestore(&free_entries_lock, flags); #ifdef CONFIG_STACKTRACE - entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; - entry->stacktrace.entries = entry->st_entries; - entry->stacktrace.skip = 2; - save_stack_trace(&entry->stacktrace); + entry->stack_len = stack_trace_save(entry->stack_entries, + ARRAY_SIZE(entry->stack_entries), + 1); #endif - return entry; } @@ -840,66 +815,46 @@ static const struct file_operations filter_fops = { .llseek = default_llseek, }; -static int dma_debug_fs_init(void) +static int dump_show(struct seq_file *seq, void *v) { - dma_debug_dent = debugfs_create_dir("dma-api", NULL); - if 
(!dma_debug_dent) { - pr_err("can not create debugfs directory\n"); - return -ENOMEM; - } + int idx; - global_disable_dent = debugfs_create_bool("disabled", 0444, - dma_debug_dent, - &global_disable); - if (!global_disable_dent) - goto out_err; - - error_count_dent = debugfs_create_u32("error_count", 0444, - dma_debug_dent, &error_count); - if (!error_count_dent) - goto out_err; - - show_all_errors_dent = debugfs_create_u32("all_errors", 0644, - dma_debug_dent, - &show_all_errors); - if (!show_all_errors_dent) - goto out_err; - - show_num_errors_dent = debugfs_create_u32("num_errors", 0644, - dma_debug_dent, - &show_num_errors); - if (!show_num_errors_dent) - goto out_err; - - num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, - dma_debug_dent, - &num_free_entries); - if (!num_free_entries_dent) - goto out_err; - - min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, - dma_debug_dent, - &min_free_entries); - if (!min_free_entries_dent) - goto out_err; - - nr_total_entries_dent = debugfs_create_u32("nr_total_entries", 0444, - dma_debug_dent, - &nr_total_entries); - if (!nr_total_entries_dent) - goto out_err; - - filter_dent = debugfs_create_file("driver_filter", 0644, - dma_debug_dent, NULL, &filter_fops); - if (!filter_dent) - goto out_err; + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + spin_lock_irqsave(&bucket->lock, flags); + list_for_each_entry(entry, &bucket->list, list) { + seq_printf(seq, + "%s %s %s idx %d P=%llx N=%lx D=%llx L=%llx %s %s\n", + dev_name(entry->dev), + dev_driver_string(entry->dev), + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + spin_unlock_irqrestore(&bucket->lock, flags); + } return 0; +} +DEFINE_SHOW_ATTRIBUTE(dump); -out_err: - debugfs_remove_recursive(dma_debug_dent); - - return -ENOMEM; +static void dma_debug_fs_init(void) +{ + struct dentry *dentry = debugfs_create_dir("dma-api", NULL); + + debugfs_create_bool("disabled", 0444, dentry, &global_disable); + debugfs_create_u32("error_count", 0444, dentry, &error_count); + debugfs_create_u32("all_errors", 0644, dentry, &show_all_errors); + debugfs_create_u32("num_errors", 0644, dentry, &show_num_errors); + debugfs_create_u32("num_free_entries", 0444, dentry, &num_free_entries); + debugfs_create_u32("min_free_entries", 0444, dentry, &min_free_entries); + debugfs_create_u32("nr_total_entries", 0444, dentry, &nr_total_entries); + debugfs_create_file("driver_filter", 0644, dentry, NULL, &filter_fops); + debugfs_create_file("dump", 0444, dentry, NULL, &dump_fops); } static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) @@ -985,12 +940,7 @@ static int dma_debug_init(void) spin_lock_init(&dma_entry_hash[i].lock); } - if (dma_debug_fs_init() != 0) { - pr_err("error creating debugfs entries - disabling\n"); - global_disable = true; - - return 0; - } + dma_debug_fs_init(); nr_pages = DIV_ROUND_UP(nr_prealloc_entries, DMA_DEBUG_DYNAMIC_ENTRIES); for (i = 0; i < nr_pages; ++i) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 355d16acee6d..b90e1aede743 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long 
attrs) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int page_order = get_order(size); struct page *page = NULL; u64 phys_mask; @@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); again: - /* CMA can be used only in the context which permits sleeping */ - if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, - gfp & __GFP_NOWARN); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_release_from_contiguous(dev, page, count); - page = NULL; - } - } - if (!page) - page = alloc_pages_node(dev_to_node(dev), gfp, page_order); - + page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __free_pages(page, page_order); + dma_free_contiguous(dev, page, size); page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && @@ -132,8 +119,7 @@ again: goto again; } - if (IS_ENABLED(CONFIG_ZONE_DMA) && - phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { + if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } @@ -152,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, if (!page) return NULL; + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* remove any dirty cache lines on the kernel alias */ + if (!PageHighMem(page)) + arch_dma_prep_coherent(page, size); + /* return the page pointer as the opaque cookie */ + return page; + } + if (PageHighMem(page)) { /* * Depending on the cma= arguments and per-arch setup - * dma_alloc_from_contiguous could return highmem pages. + * dma_alloc_contiguous could return highmem pages. * Without remapping there is no way to return them here, * so log an error and fail. 
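dma_direct_alloc_pages() above now honours DMA_ATTR_NO_KERNEL_MAPPING directly, returning the struct page as an opaque cookie; the matching free-side handling follows in dma_direct_free_pages(). A sketch of the caller's view, with grab_frame_buffer() and frame_size as illustrative names:

#include <linux/dma-mapping.h>
#include <linux/gfp.h>

static void *grab_frame_buffer(struct device *dev, size_t frame_size,
			       dma_addr_t *dma)
{
	/* The cookie returned with DMA_ATTR_NO_KERNEL_MAPPING is not a
	 * kernel virtual address; it may only be handed back to
	 * dma_free_attrs() with the same size and attrs. */
	return dma_alloc_attrs(dev, frame_size, dma, GFP_KERNEL,
			       DMA_ATTR_NO_KERNEL_MAPPING);
}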
*/ @@ -172,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, *dma_handle = phys_to_dma(dev, page_to_phys(page)); } memset(ret, 0, size); + + if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) { + arch_dma_prep_coherent(page, size); + ret = uncached_kernel_address(ret); + } + return ret; } void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) { - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - if (!dma_release_from_contiguous(dev, page, count)) - __free_pages(page, get_order(size)); + dma_free_contiguous(dev, page, size); } void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, @@ -188,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, { unsigned int page_order = get_order(size); + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + /* cpu_addr is a struct page cookie, not a kernel address */ + __dma_direct_free_pages(dev, size, cpu_addr); + return; + } + if (force_dma_unencrypted()) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + + if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) + cpu_addr = cached_kernel_address(cpu_addr); __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); } void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) + if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); } @@ -204,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { - if (!dev_is_dma_coherent(dev)) + if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); @@ -312,7 +322,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - (!dev || dma_capable(dev, dma_addr, size)); + dma_capable(dev, dma_addr, size); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -356,6 +366,20 @@ out_unmap: } EXPORT_SYMBOL(dma_direct_map_sg); +dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t dma_addr = paddr; + + if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + report_addr(dev, dma_addr, size); + return DMA_MAPPING_ERROR; + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_direct_map_resource); + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by @@ -380,3 +404,14 @@ int dma_direct_supported(struct device *dev, u64 mask) */ return mask >= __phys_to_dma(dev, min_mask); } + +size_t dma_direct_max_mapping_size(struct device *dev) +{ + size_t size = SIZE_MAX; + + /* If SWIOTLB is active, use its maximum mapping size */ + if (is_swiotlb_active()) + size = swiotlb_max_mapping_size(dev); + + return size; +} diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index a11006b6d8e8..1f628e7ac709 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -207,7 
+207,6 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, } EXPORT_SYMBOL(dma_mmap_attrs); -#ifndef ARCH_HAS_DMA_GET_REQUIRED_MASK static u64 dma_default_get_required_mask(struct device *dev) { u32 low_totalram = ((max_pfn - 1) << PAGE_SHIFT); @@ -238,11 +237,6 @@ u64 dma_get_required_mask(struct device *dev) return dma_default_get_required_mask(dev); } EXPORT_SYMBOL_GPL(dma_get_required_mask); -#endif - -#ifndef arch_dma_alloc_attrs -#define arch_dma_alloc_attrs(dev) (true) -#endif void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) @@ -250,7 +244,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; - WARN_ON_ONCE(dev && !dev->coherent_dma_mask); + WARN_ON_ONCE(!dev->coherent_dma_mask); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; @@ -258,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, /* let the implementation decide on the zone to allocate from: */ flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM); - if (!arch_dma_alloc_attrs(&dev)) - return NULL; - if (dma_is_direct(ops)) cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs); else if (ops->alloc) @@ -318,22 +309,39 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -#ifndef HAVE_ARCH_DMA_SET_MASK +#ifdef CONFIG_ARCH_HAS_DMA_SET_MASK +void arch_dma_set_mask(struct device *dev, u64 mask); +#else +#define arch_dma_set_mask(dev, mask) do { } while (0) +#endif + int dma_set_mask(struct device *dev, u64 mask) { + /* + * Truncate the mask to the actually supported dma_addr_t width to + * avoid generating unsupportable addresses. + */ + mask = (dma_addr_t)mask; + if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; + arch_dma_set_mask(dev, mask); dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } EXPORT_SYMBOL(dma_set_mask); -#endif #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { + /* + * Truncate the mask to the actually supported dma_addr_t width to + * avoid generating unsupportable addresses. 
+ */ + mask = (dma_addr_t)mask; + if (!dma_supported(dev, mask)) return -EIO; @@ -357,3 +365,17 @@ void dma_cache_sync(struct device *dev, void *vaddr, size_t size, ops->cache_sync(dev, vaddr, size, dir); } EXPORT_SYMBOL(dma_cache_sync); + +size_t dma_max_mapping_size(struct device *dev) +{ + const struct dma_map_ops *ops = get_dma_ops(dev); + size_t size = SIZE_MAX; + + if (dma_is_direct(ops)) + size = dma_direct_max_mapping_size(dev); + else if (ops && ops->max_mapping_size) + size = ops->max_mapping_size(dev); + + return size; +} +EXPORT_SYMBOL_GPL(dma_max_mapping_size); diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 7a723194ecbe..a594aec07882 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -158,6 +158,9 @@ out: bool dma_in_atomic_pool(void *start, size_t size) { + if (unlikely(!atomic_pool)) + return false; + return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); } @@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, size = PAGE_ALIGN(size); - if (!gfpflags_allow_blocking(flags) && - !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) { + if (!gfpflags_allow_blocking(flags)) { ret = dma_alloc_from_pool(size, &page, flags); if (!ret) return NULL; @@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - ret = page; /* opaque cookie */ - goto done; - } - /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), @@ -237,10 +234,7 @@ done: void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { - /* vaddr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, vaddr); - } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { + if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { phys_addr_t phys = dma_to_phys(dev, dma_handle); struct page *page = pfn_to_page(__phys_to_pfn(phys)); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 2ca8a200be97..9de232229063 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Dynamic DMA mapping support. * @@ -199,6 +200,7 @@ void __init swiotlb_update_mem_attributes(void) int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; + size_t alloc_size; bytes = nslabs << IO_TLB_SHIFT; @@ -211,12 +213,18 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. 
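dma_max_mapping_size(), introduced above, lets drivers discover how large a single streaming mapping can be; with SWIOTLB active the backend (swiotlb_max_mapping_size(), further down) caps this at one bounce-buffer segment. A sketch of a driver clamping its transfer limit; struct my_host and max_transfer are illustrative names:

#include <linux/dma-mapping.h>

struct my_host {
	size_t max_transfer;
};

static void my_host_clamp_transfer(struct device *dev, struct my_host *host)
{
	size_t limit = dma_max_mapping_size(dev);

	/* SIZE_MAX means "no particular limit"; anything smaller (e.g. the
	 * SWIOTLB segment size) must bound the per-command transfer. */
	if (limit < host->max_transfer)
		host->max_transfer = limit;
}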
*/ - io_tlb_list = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), - PAGE_SIZE); - io_tlb_orig_addr = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), - PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); + io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_list) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_addr) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; @@ -249,7 +257,7 @@ swiotlb_init(int verbose) bytes = io_tlb_nslabs << IO_TLB_SHIFT; /* Get IO TLB memory from the low pages */ - vstart = memblock_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + vstart = memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; @@ -672,45 +680,30 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, return true; } -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) +size_t swiotlb_max_mapping_size(struct device *dev) { - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; + return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; +} + +bool is_swiotlb_active(void) +{ + /* + * When SWIOTLB is initialized, even if io_tlb_start points to physical + * address zero, io_tlb_end surely doesn't. + */ + return io_tlb_end != 0; } #ifdef CONFIG_DEBUG_FS static int __init swiotlb_create_debugfs(void) { - struct dentry *d_swiotlb_usage; - struct dentry *ent; - - d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL); - - if (!d_swiotlb_usage) - return -ENOMEM; - - ent = debugfs_create_ulong("io_tlb_nslabs", 0400, - d_swiotlb_usage, &io_tlb_nslabs); - if (!ent) - goto fail; - - ent = debugfs_create_ulong("io_tlb_used", 0400, - d_swiotlb_usage, &io_tlb_used); - if (!ent) - goto fail; + struct dentry *root; + root = debugfs_create_dir("swiotlb", NULL); + debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); + debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); return 0; - -fail: - debugfs_remove_recursive(d_swiotlb_usage); - return -ENOMEM; } late_initcall(swiotlb_create_debugfs); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 24a77c34e9ad..c2b41a263166 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events callchain code, extracted from core.c: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> diff --git a/kernel/events/core.c b/kernel/events/core.c index e5ede6918050..eea9d52b010c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events core code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/fs.h> @@ -385,6 +384,8 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1171,7 +1172,7 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) static void get_ctx(struct perf_event_context *ctx) { - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); + refcount_inc(&ctx->refcount); } static void free_ctx(struct rcu_head *head) @@ -1185,7 +1186,7 @@ static void free_ctx(struct rcu_head *head) static void put_ctx(struct perf_event_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) { + if (refcount_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); if (ctx->task && ctx->task != TASK_TOMBSTONE) @@ -1254,6 +1255,7 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * perf_addr_filters_head::lock * * cpu_hotplug_lock * pmus_lock @@ -1267,7 +1269,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting) again: rcu_read_lock(); ctx = READ_ONCE(event->ctx); - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (!refcount_inc_not_zero(&ctx->refcount)) { rcu_read_unlock(); goto again; } @@ -1400,7 +1402,7 @@ retry: } if (ctx->task == TASK_TOMBSTONE || - !atomic_inc_not_zero(&ctx->refcount)) { + !refcount_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; } else { @@ -2007,8 +2009,8 @@ event_sched_out(struct perf_event *event, event->pmu->del(event, 0); event->oncpu = -1; - if (event->pending_disable) { - event->pending_disable = 0; + if (READ_ONCE(event->pending_disable) >= 0) { + WRITE_ONCE(event->pending_disable, -1); state = PERF_EVENT_STATE_OFF; } perf_event_set_state(event, state); @@ -2196,7 +2198,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { - event->pending_disable = 1; + WRITE_ONCE(event->pending_disable, smp_processor_id()); + /* can fail, see perf_pending_event_disable() */ irq_work_queue(&event->pending); } @@ -2475,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, perf_pmu_enable(cpuctx->ctx.pmu); } +void perf_pmu_resched(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, task_ctx); + ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); + perf_ctx_unlock(cpuctx, task_ctx); +} + /* * Cross CPU call to install and enable a performance event * @@ -2540,6 +2553,9 @@ unlock: return ret; } +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx); + /* * Attach a performance event to a context. 
* @@ -2554,6 +2570,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); + WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); + if (event->cpu != -1) event->cpu = cpu; @@ -2797,7 +2815,7 @@ static int perf_event_stop(struct perf_event *event, int restart) * * (p1) when userspace mappings change as a result of (1) or (2) or (3) below, * we update the addresses of corresponding vmas in - * event::addr_filters_offs array and bump the event::addr_filters_gen; + * event::addr_filter_ranges array and bump the event::addr_filters_gen; * (p2) when an event is scheduled in (pmu::add), it calls * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync() * if the generation has changed since the previous call. @@ -2939,6 +2957,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; + /* + * If we had been multiplexing, no rotations are necessary, now no events + * are active. + */ + ctx->rotate_necessary = 0; + perf_pmu_disable(ctx->pmu); if (is_active & EVENT_PINNED) { list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) @@ -3306,10 +3330,13 @@ static int flexible_sched_in(struct perf_event *event, void *data) return 0; if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { - if (!group_sched_in(event, sid->cpuctx, sid->ctx)) - list_add_tail(&event->active_list, &sid->ctx->flexible_active); - else + int ret = group_sched_in(event, sid->cpuctx, sid->ctx); + if (ret) { sid->can_add_hw = 0; + sid->ctx->rotate_necessary = 1; + return 0; + } + list_add_tail(&event->active_list, &sid->ctx->flexible_active); } return 0; @@ -3677,24 +3704,17 @@ ctx_first_active(struct perf_event_context *ctx) static bool perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event *cpu_event = NULL, *task_event = NULL; - bool cpu_rotate = false, task_rotate = false; - struct perf_event_context *ctx = NULL; + struct perf_event_context *task_ctx = NULL; + int cpu_rotate, task_rotate; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - if (cpuctx->ctx.nr_events) { - if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - cpu_rotate = true; - } - - ctx = cpuctx->task_ctx; - if (ctx && ctx->nr_events) { - if (ctx->nr_events != ctx->nr_active) - task_rotate = true; - } + cpu_rotate = cpuctx->ctx.rotate_necessary; + task_ctx = cpuctx->task_ctx; + task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; @@ -3703,7 +3723,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_disable(cpuctx->ctx.pmu); if (task_rotate) - task_event = ctx_first_active(ctx); + task_event = ctx_first_active(task_ctx); if (cpu_rotate) cpu_event = ctx_first_active(&cpuctx->ctx); @@ -3711,17 +3731,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. 
*/ - if (task_event || (ctx && cpu_event)) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_ctx && cpu_event)) + ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); if (cpu_event) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (task_event) - rotate_ctx(ctx, task_event); + rotate_ctx(task_ctx, task_event); if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, ctx, current); + perf_event_sched_in(cpuctx, task_ctx, current); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4056,7 +4076,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->event_list); INIT_LIST_HEAD(&ctx->pinned_active); INIT_LIST_HEAD(&ctx->flexible_active); - atomic_set(&ctx->refcount, 1); + refcount_set(&ctx->refcount, 1); } static struct perf_event_context * @@ -4235,8 +4255,9 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || - attr->context_switch) + attr->task || attr->ksymbol || + attr->context_switch || + attr->bpf_event) return true; return false; } @@ -4305,6 +4326,10 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -4340,7 +4365,7 @@ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return 0; /* @@ -4371,7 +4396,7 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ @@ -4391,14 +4416,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) return false; } -/* Called under the same ctx::mutex as perf_install_in_context() */ static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + lockdep_assert_held(&ctx->mutex); + + if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { @@ -4440,17 +4466,25 @@ static void _free_event(struct perf_event *event) perf_event_free_bpf_prog(event); perf_addr_filters_splice(event, NULL); - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); if (event->destroy) event->destroy(event); - if (event->ctx) - put_ctx(event->ctx); - + /* + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. + */ if (event->hw.target) put_task_struct(event->hw.target); + /* + * perf_event_free_task() relies on put_ctx() being 'last', in particular + * all task references must be cleaned up. + */ + if (event->ctx) + put_ctx(event->ctx); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -4630,8 +4664,17 @@ again: mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { + void *var = &child->ctx->refcount; + list_del(&child->child_list); free_event(child); + + /* + * Wake any perf_event_free_task() waiting for this event to be + * freed. 
+ */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); } no_ctx: @@ -4963,6 +5006,11 @@ static void __perf_event_period(struct perf_event *event, } } +static int perf_event_check_period(struct perf_event *event, u64 value) +{ + return event->pmu->check_period(event, value); +} + static int perf_event_period(struct perf_event *event, u64 __user *arg) { u64 value; @@ -4979,6 +5027,12 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; + if (perf_event_check_period(event, value)) + return -EINVAL; + + if (!event->attr.freq && (value & (1ULL << 63))) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; @@ -5388,7 +5442,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { - if (!atomic_inc_not_zero(&rb->refcount)) + if (!refcount_inc_not_zero(&rb->refcount)) rb = NULL; } rcu_read_unlock(); @@ -5398,7 +5452,7 @@ struct ring_buffer *ring_buffer_get(struct perf_event *event) void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&rb->refcount)) + if (!refcount_dec_and_test(&rb->refcount)) return; WARN_ON_ONCE(!list_empty(&rb->event_list)); @@ -5459,11 +5513,11 @@ static void perf_mmap_close(struct vm_area_struct *vma) /* now it's safe to free the pages */ atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm); /* this has to be the last one */ rb_free_aux(rb); - WARN_ON_ONCE(atomic_read(&rb->aux_refcount)); + WARN_ON_ONCE(refcount_read(&rb->aux_refcount)); mutex_unlock(&event->mmap_mutex); } @@ -5532,7 +5586,7 @@ again: */ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; + atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm); free_uid(mmap_user); out_put: @@ -5680,7 +5734,7 @@ accounting: lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->pinned_vm + extra; + locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -5721,7 +5775,7 @@ accounting: unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic64_add(extra, &vma->vm_mm->pinned_vm); atomic_inc(&event->mmap_count); } else if (rb) { @@ -5795,10 +5849,45 @@ void perf_event_wakeup(struct perf_event *event) } } +static void perf_pending_event_disable(struct perf_event *event) +{ + int cpu = READ_ONCE(event->pending_disable); + + if (cpu < 0) + return; + + if (cpu == smp_processor_id()) { + WRITE_ONCE(event->pending_disable, -1); + perf_event_disable_local(event); + return; + } + + /* + * CPU-A CPU-B + * + * perf_event_disable_inatomic() + * @pending_disable = CPU-A; + * irq_work_queue(); + * + * sched-out + * @pending_disable = -1; + * + * sched-in + * perf_event_disable_inatomic() + * @pending_disable = CPU-B; + * irq_work_queue(); // FAILS + * + * irq_work_run() + * perf_pending_event() + * + * But the event runs on CPU-B and wants disabling there. 
+ */ + irq_work_queue_on(&event->pending, cpu); +} + static void perf_pending_event(struct irq_work *entry) { - struct perf_event *event = container_of(entry, - struct perf_event, pending); + struct perf_event *event = container_of(entry, struct perf_event, pending); int rctx; rctx = perf_swevent_get_recursion_context(); @@ -5807,10 +5896,7 @@ static void perf_pending_event(struct irq_work *entry) * and we won't recurse 'further'. */ - if (event->pending_disable) { - event->pending_disable = 0; - perf_event_disable_local(event); - } + perf_pending_event_disable(event); if (event->pending_wakeup) { event->pending_wakeup = 0; @@ -5865,7 +5951,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; - } else if (current->mm) { + } else if (!(current->flags & PF_KTHREAD)) { perf_get_regs_user(regs_user, regs, regs_user_copy); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; @@ -6489,7 +6575,7 @@ void perf_prepare_sample(struct perf_event_header *header, data->phys_addr = perf_virt_to_phys(data->addr); } -static __always_inline void +static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, @@ -6499,13 +6585,15 @@ __perf_event_output(struct perf_event *event, { struct perf_output_handle handle; struct perf_event_header header; + int err; /* protect the callchain buffers */ rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (output_begin(&handle, event, header.size)) + err = output_begin(&handle, event, header.size); + if (err) goto exit; perf_output_sample(&handle, &header, data, event); @@ -6514,6 +6602,7 @@ __perf_event_output(struct perf_event *event, exit: rcu_read_unlock(); + return err; } void @@ -6532,12 +6621,12 @@ perf_event_output_backward(struct perf_event *event, __perf_event_output(event, data, regs, perf_output_begin_backward); } -void +int perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - __perf_event_output(event, data, regs, perf_output_begin); + return __perf_event_output(event, data, regs, perf_output_begin); } /* @@ -6678,7 +6767,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { if (filter->path.dentry) { - event->addr_filters_offs[count] = 0; + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; restart++; } @@ -7170,6 +7260,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; + u32 type = mmap_event->event_id.header.type; int ret; if (!perf_event_mmap_match(event, data)) @@ -7213,6 +7304,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_end(&handle); out: mmap_event->event_id.header.size = size; + mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -7358,28 +7450,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter, return true; } +static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter, + struct vm_area_struct *vma, + struct perf_addr_filter_range *fr) +{ + unsigned long vma_size = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct file *file = vma->vm_file; + + if 
(!perf_addr_filter_match(filter, file, off, vma_size)) + return false; + + if (filter->offset < off) { + fr->start = vma->vm_start; + fr->size = min(vma_size, filter->size - (off - filter->offset)); + } else { + fr->start = vma->vm_start + filter->offset - off; + fr->size = min(vma->vm_end - fr->start, filter->size); + } + + return true; +} + static void __perf_addr_filters_adjust(struct perf_event *event, void *data) { struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); struct vm_area_struct *vma = data; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags; - struct file *file = vma->vm_file; struct perf_addr_filter *filter; unsigned int restart = 0, count = 0; + unsigned long flags; if (!has_addr_filter(event)) return; - if (!file) + if (!vma->vm_file) return; raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - if (perf_addr_filter_match(filter, file, off, - vma->vm_end - vma->vm_start)) { - event->addr_filters_offs[count] = vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, + &event->addr_filter_ranges[count])) restart++; - } count++; } @@ -7650,6 +7761,207 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; 
+}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -8248,9 +8560,9 @@ static int perf_tp_event_match(struct perf_event *event, if (event->hw.state & PERF_HES_STOPPED) return 0; /* - * All tracepoints are from kernel-space. + * If exclude_kernel, only trace user-space tracepoints (uprobes) */ - if (event->attr.exclude_kernel) + if (event->attr.exclude_kernel && !user_mode(regs)) return 0; if (!perf_tp_filter_match(event, data)) @@ -8768,26 +9080,19 @@ static void perf_addr_filters_splice(struct perf_event *event, * @filter; if so, adjust filter's address range. * Called with mm::mmap_sem down for reading. 
*/ -static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter, - struct mm_struct *mm) +static void perf_addr_filter_apply(struct perf_addr_filter *filter, + struct mm_struct *mm, + struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; for (vma = mm->mmap; vma; vma = vma->vm_next) { - struct file *file = vma->vm_file; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - unsigned long vma_size = vma->vm_end - vma->vm_start; - - if (!file) - continue; - - if (!perf_addr_filter_match(filter, file, off, vma_size)) + if (!vma->vm_file) continue; - return vma->vm_start; + if (perf_addr_filter_vma_adjust(filter, vma, fr)) + return; } - - return 0; } /* @@ -8810,26 +9115,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event) if (task == TASK_TOMBSTONE) return; - if (!ifh->nr_file_filters) - return; - - mm = get_task_mm(event->ctx->task); - if (!mm) - goto restart; + if (ifh->nr_file_filters) { + mm = get_task_mm(event->ctx->task); + if (!mm) + goto restart; - down_read(&mm->mmap_sem); + down_read(&mm->mmap_sem); + } raw_spin_lock_irqsave(&ifh->lock, flags); list_for_each_entry(filter, &ifh->list, entry) { - event->addr_filters_offs[count] = 0; + if (filter->path.dentry) { + /* + * Adjust base offset if the filter is associated to a + * binary that needs to be mapped: + */ + event->addr_filter_ranges[count].start = 0; + event->addr_filter_ranges[count].size = 0; - /* - * Adjust base offset if the filter is associated to a binary - * that needs to be mapped: - */ - if (filter->path.dentry) - event->addr_filters_offs[count] = - perf_addr_filter_apply(filter, mm); + perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]); + } else { + event->addr_filter_ranges[count].start = filter->offset; + event->addr_filter_ranges[count].size = filter->size; + } count++; } @@ -8837,9 +9145,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) event->addr_filters_gen++; raw_spin_unlock_irqrestore(&ifh->lock, flags); - up_read(&mm->mmap_sem); + if (ifh->nr_file_filters) { + up_read(&mm->mmap_sem); - mmput(mm); + mmput(mm); + } restart: perf_event_stop(event, 1); @@ -8943,6 +9253,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, case IF_SRC_KERNELADDR: case IF_SRC_KERNEL: kernel = 1; + /* fall through */ case IF_SRC_FILEADDR: case IF_SRC_FILE: @@ -9391,6 +9702,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } +static int perf_event_nop_int(struct perf_event *event, u64 value) +{ + return 0; +} + static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) @@ -9586,6 +9902,12 @@ static int pmu_dev_alloc(struct pmu *pmu) if (ret) goto del_dev; + if (pmu->attr_update) + ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); + + if (ret) + goto del_dev; + out: return ret; @@ -9691,6 +10013,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->check_period) + pmu->check_period = perf_event_nop_int; + if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; @@ -9742,6 +10067,12 @@ void perf_pmu_unregister(struct pmu *pmu) } EXPORT_SYMBOL_GPL(perf_pmu_unregister); +static inline bool has_extended_regs(struct perf_event *event) +{ + return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || + (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); +} + static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { struct perf_event_context *ctx = NULL; @@ -9772,6 +10103,19 @@ static int 
perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (ctx) perf_event_ctx_unlock(event->group_leader, ctx); + if (!ret) { + if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && + has_extended_regs(event)) + ret = -EOPNOTSUPP; + + if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && + event_has_any_exclude_flag(event)) + ret = -EINVAL; + + if (ret && event->destroy) + event->destroy(event); + } + if (ret) module_put(pmu->module); @@ -9900,6 +10244,10 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); if (inc) { /* @@ -9980,6 +10328,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, init_waitqueue_head(&event->waitq); + event->pending_disable = -1; init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); @@ -10082,14 +10431,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_pmu; if (has_addr_filter(event)) { - event->addr_filters_offs = kcalloc(pmu->nr_addr_filters, - sizeof(unsigned long), - GFP_KERNEL); - if (!event->addr_filters_offs) { + event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters, + sizeof(struct perf_addr_filter_range), + GFP_KERNEL); + if (!event->addr_filter_ranges) { err = -ENOMEM; goto err_per_task; } + /* + * Clone the parent's vma offsets: they are valid until exec() + * even if the mm is not shared with the parent. + */ + if (event->parent) { + struct perf_addr_filters_head *ifh = perf_event_addr_filters(event); + + raw_spin_lock_irq(&ifh->lock); + memcpy(event->addr_filter_ranges, + event->parent->addr_filter_ranges, + pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range)); + raw_spin_unlock_irq(&ifh->lock); + } + /* force hw sync on the address filters */ event->addr_filters_gen = 1; } @@ -10108,7 +10471,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return event; err_addr_filters: - kfree(event->addr_filters_offs); + kfree(event->addr_filter_ranges); err_per_task: exclusive_event_destroy(event); @@ -10361,11 +10724,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) break; case CLOCK_BOOTTIME: - event->clock = &ktime_get_boot_ns; + event->clock = &ktime_get_boottime_ns; break; case CLOCK_TAI: - event->clock = &ktime_get_tai_ns; + event->clock = &ktime_get_clocktai_ns; break; default: @@ -10391,7 +10754,7 @@ __perf_event_ctx_lock_double(struct perf_event *group_leader, again: rcu_read_lock(); gctx = READ_ONCE(group_leader->ctx); - if (!atomic_inc_not_zero(&gctx->refcount)) { + if (!refcount_inc_not_zero(&gctx->refcount)) { rcu_read_unlock(); goto again; } @@ -10590,11 +10953,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } - if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { - err = -EBUSY; - goto err_context; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -10682,6 +11040,18 @@ SYSCALL_DEFINE5(perf_event_open, move_group = 0; } } + + /* + * Failure to create exclusive events returns -EBUSY. + */ + err = -EBUSY; + if (!exclusive_event_installable(group_leader, ctx)) + goto err_locked; + + for_each_sibling_event(sibling, group_leader) { + if (!exclusive_event_installable(sibling, ctx)) + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } @@ -10718,9 +11088,6 @@ SYSCALL_DEFINE5(perf_event_open, * because we need to serialize with concurrent event creation. 
*/ if (!exclusive_event_installable(event, ctx)) { - /* exclusive and group stuff are assumed mutually exclusive */ - WARN_ON_ONCE(move_group); - err = -EBUSY; goto err_locked; } @@ -11187,11 +11554,11 @@ static void perf_free_event(struct perf_event *event, } /* - * Free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. + * Free a context as created by inheritance by perf_event_init_task() below, + * used by fork() in case of fail. * - * Not all locks are strictly required, but take them anyway to be nice and - * help out with the lockdep assertions. + * Even though the task has never lived, the context and events have been + * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { @@ -11221,7 +11588,23 @@ void perf_event_free_task(struct task_struct *task) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); - put_ctx(ctx); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } } @@ -11608,7 +11991,7 @@ static void __init perf_event_init_all_cpus(void) } } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 5befb338a18d..c5cd852fe86b 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -1,18 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0+ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
- * * Copyright (C) 2007 Alan Stern * Copyright (C) IBM Corporation, 2009 * Copyright (C) 2009, Frederic Weisbecker <fweisbec@gmail.com> diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 6dc725a7e7bc..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -4,13 +4,14 @@ #include <linux/hardirq.h> #include <linux/uaccess.h> +#include <linux/refcount.h> /* Buffer handling */ #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { - atomic_t refcount; + refcount_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; @@ -23,7 +24,7 @@ struct ring_buffer { atomic_t poll; /* POLL_ for wakeups */ local_t head; /* write position */ - local_t nest; /* nested writers */ + unsigned int nest; /* nested writers */ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ @@ -40,7 +41,7 @@ struct ring_buffer { /* AUX area */ long aux_head; - local_t aux_nest; + unsigned int aux_nest; long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; @@ -48,7 +49,7 @@ struct ring_buffer { atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); - atomic_t aux_refcount; + refcount_t aux_refcount; void **aux_pages; void *aux_priv; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 309ef5a64af5..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Performance events ring-buffer code: * @@ -5,8 +6,6 @@ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> - * - * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> @@ -39,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) struct ring_buffer *rb = handle->rb; preempt_disable(); - local_inc(&rb->nest); + + /* + * Avoid an explicit LOAD/STORE such that architectures with memops + * can use them. + */ + (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } @@ -47,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; + unsigned int nest; + + /* + * If this isn't the outermost nesting, we don't have to update + * @rb->user_page->data_head. + */ + nest = READ_ONCE(rb->nest); + if (nest > 1) { + WRITE_ONCE(rb->nest, nest - 1); + goto out; + } again: + /* + * In order to avoid publishing a head value that goes backwards, + * we must ensure the load of @rb->head happens after we've + * incremented @rb->nest. + * + * Otherwise we can observe a @rb->head value before one published + * by an IRQ/NMI happening between the load and the increment. + */ + barrier(); head = local_read(&rb->head); /* - * IRQ/NMI can happen here, which means we can miss a head update. + * IRQ/NMI can happen here and advance @rb->head, causing our + * load above to be stale. */ - if (!local_dec_and_test(&rb->nest)) - goto out; - /* * Since the mmap() consumer (userspace) can run on a different CPU: * @@ -85,14 +107,23 @@ again: * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* - * Now check if we missed an update -- rely on previous implied - * compiler barriers to force a re-read. 
+ * We must publish the head before decrementing the nest count, + * otherwise an IRQ/NMI can publish a more recent head value and our + * write will (temporarily) publish a stale value. */ + barrier(); + WRITE_ONCE(rb->nest, 0); + + /* + * Ensure we decrement @rb->nest before we validate the @rb->head. + * Otherwise we cannot be sure we caught the 'last' nested update. + */ + barrier(); if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); + WRITE_ONCE(rb->nest, 1); goto again; } @@ -285,7 +316,7 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) else rb->overwrite = 1; - atomic_set(&rb->refcount, 1); + refcount_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); @@ -331,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; + unsigned int nest; if (output_event->parent) output_event = output_event->parent; @@ -358,16 +390,19 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!atomic_read(&rb->aux_mmap_count)) goto err; - if (!atomic_inc_not_zero(&rb->aux_refcount)) + if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; + nest = READ_ONCE(rb->aux_nest); /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ - if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + if (WARN_ON_ONCE(nest)) goto err_put; + WRITE_ONCE(rb->aux_nest, nest + 1); + aux_head = rb->aux_head; handle->rb = rb; @@ -393,9 +428,9 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = 1; + event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); goto err_put; } } @@ -456,38 +491,35 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; } - if (size || handle->aux_flags) { - /* - * Only send RECORD_AUX if we have something useful to communicate - * - * Note: the OVERWRITE records by themselves are not considered - * useful, as they don't communicate any *new* information, - * aside from the short-lived offset, that becomes history at - * the next event sched-in and therefore isn't useful. - * The userspace that needs to copy out AUX data in overwrite - * mode should know to use user_page::aux_head for the actual - * offset. So, from now on we don't output AUX records that - * have *only* OVERWRITE flag set. - */ - - if (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE) - perf_event_aux_event(handle->event, aux_head, size, - handle->aux_flags); - } + /* + * Only send RECORD_AUX if we have something useful to communicate + * + * Note: the OVERWRITE records by themselves are not considered + * useful, as they don't communicate any *new* information, + * aside from the short-lived offset, that becomes history at + * the next event sched-in and therefore isn't useful. + * The userspace that needs to copy out AUX data in overwrite + * mode should know to use user_page::aux_head for the actual + * offset. So, from now on we don't output AUX records that + * have *only* OVERWRITE flag set. 
+ */ + if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE)) + perf_event_aux_event(handle->event, aux_head, size, + handle->aux_flags); - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = 1; + handle->event->pending_disable = smp_processor_id(); perf_output_wakeup(handle); } handle->event = NULL; - local_set(&rb->aux_nest, 0); + WRITE_ONCE(rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); @@ -507,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; @@ -599,29 +631,26 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM, max_order = 0; + int ret = -ENOMEM, max_order; if (!has_aux(event)) return -EOPNOTSUPP; - if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { - /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. - */ - max_order = ilog2(nr_pages); + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. + */ + max_order = ilog2(nr_pages); - /* - * PMU requests more than one contiguous chunks of memory - * for SW double buffering - */ - if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && - !overwrite) { - if (!max_order) - return -EINVAL; + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if (!overwrite) { + if (!max_order) + return -EINVAL; - max_order--; - } + max_order--; } rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, @@ -658,7 +687,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, goto out; } - rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + rb->aux_priv = event->pmu->setup_aux(event, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; @@ -671,7 +700,7 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, * we keep a refcount here to make sure either of the two can * reference them safely. 
*/ - atomic_set(&rb->aux_refcount, 1); + refcount_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; @@ -690,7 +719,7 @@ out: void rb_free_aux(struct ring_buffer *rb) { - if (atomic_dec_and_test(&rb->aux_refcount)) + if (refcount_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } @@ -734,7 +763,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); - if (order_base_2(size) >= MAX_ORDER) + if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; rb = kzalloc(size, GFP_KERNEL); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8aef47ee7bfa..84fa00497c49 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * User-space Probes (UProbes) * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * * Copyright (C) IBM Corporation, 2008-2012 * Authors: * Srikar Dronamraju @@ -59,14 +46,14 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) -static struct percpu_rw_semaphore dup_mmap_sem; +DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ - atomic_t ref; + refcount_t ref; struct rw_semaphore register_rwsem; struct rw_semaphore consumer_rwsem; struct list_head pending_list; @@ -174,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); @@ -560,13 +548,13 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v static struct uprobe *get_uprobe(struct uprobe *uprobe) { - atomic_inc(&uprobe->ref); + refcount_inc(&uprobe->ref); return uprobe; } static void put_uprobe(struct uprobe *uprobe) { - if (atomic_dec_and_test(&uprobe->ref)) { + if (refcount_dec_and_test(&uprobe->ref)) { /* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from @@ -657,7 +645,7 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) rb_link_node(&uprobe->rb_node, parent, p); rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ - atomic_set(&uprobe->ref, 2); + refcount_set(&uprobe->ref, 2); return u; } @@ -2041,7 +2029,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) if (uc->handler) { rc = uc->handler(uc, 
regs); WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + "bad rc=0x%x from %ps()\n", rc, uc->handler); } if (uc->ret_handler) @@ -2124,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs) sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); - force_sig(SIGILL, current); + force_sig(SIGILL); } @@ -2240,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) if (unlikely(err)) { uprobe_warn(current, "execute the probed insn, sending SIGILL."); - force_sig(SIGILL, current); + force_sig(SIGILL); } } @@ -2307,16 +2295,12 @@ static struct notifier_block uprobe_exception_nb = { .priority = INT_MAX-1, /* notified after kprobes, kgdb */ }; -static int __init init_uprobes(void) +void __init uprobes_init(void) { int i; for (i = 0; i < UPROBES_HASH_SZ; i++) mutex_init(&uprobes_mmap_mutex[i]); - if (percpu_init_rwsem(&dup_mmap_sem)) - return -ENOMEM; - - return register_die_notifier(&uprobe_exception_nb); + BUG_ON(register_die_notifier(&uprobe_exception_nb)); } -__initcall(init_uprobes); diff --git a/kernel/exit.c b/kernel/exit.c index 2639a30a8aa5..a75b6a7f458a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/exit.c * @@ -194,6 +195,7 @@ repeat: rcu_read_unlock(); proc_flush_task(p); + cgroup_release(p); write_lock_irq(&tasklist_lock); ptrace_release_task(p); @@ -421,7 +423,7 @@ retry: * freed task structure. */ if (atomic_read(&mm->mm_users) <= 1) { - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; } @@ -461,7 +463,7 @@ retry: * most likely racing with swapoff (try_to_unuse()) or /proc or * ptrace or page migration (get_task_mm()). Mark owner as NULL. */ - mm->owner = NULL; + WRITE_ONCE(mm->owner, NULL); return; assign_new_owner: @@ -482,7 +484,7 @@ assign_new_owner: put_task_struct(c); goto retry; } - mm->owner = c; + WRITE_ONCE(mm->owner, c); task_unlock(c); put_task_struct(c); } diff --git a/kernel/extable.c b/kernel/extable.c index 6a5b61ebc66c..e23cce6e6092 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Rewritten by Rusty Russell, on the backs of many others... Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/ftrace.h> #include <linux/memory.h> diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..63b349168da7 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val) DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, "%llx\n"); -static int fei_debugfs_add_attr(struct fei_attr *attr) +static void fei_debugfs_add_attr(struct fei_attr *attr) { struct dentry *dir; dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); - if (!dir) - return -ENOMEM; - - if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { - debugfs_remove_recursive(dir); - return -ENOMEM; - } - return 0; + debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops); } static void fei_debugfs_remove_attr(struct fei_attr *attr) @@ -210,7 +203,7 @@ static int fei_seq_show(struct seq_file *m, void *v) { struct fei_attr *attr = list_entry(v, struct fei_attr, list); - seq_printf(m, "%pf\n", attr->kp.addr); + seq_printf(m, "%ps\n", attr->kp.addr); return 0; } @@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = register_kprobe(&attr->kp); if (!ret) - ret = fei_debugfs_add_attr(attr); + fei_debugfs_add_attr(attr); if (ret < 0) fei_attr_remove(attr); else { @@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void) return PTR_ERR(dir); /* injectable attribute is just a symlink of error_inject/list */ - if (!debugfs_create_symlink("injectable", dir, - "../error_injection/list")) - goto error; + debugfs_create_symlink("injectable", dir, "../error_injection/list"); - if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) - goto error; + debugfs_create_file("inject", 0600, dir, NULL, &fei_ops); fei_debugfs_dir = dir; return 0; -error: - debugfs_remove_recursive(dir); - return -ENOMEM; } late_initcall(fei_debugfs_init); diff --git a/kernel/fork.c b/kernel/fork.c index b69248e6f0e0..d8ae0f1b4148 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * @@ -11,6 +12,7 @@ * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ +#include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> @@ -21,6 +23,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> @@ -77,7 +80,6 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -121,7 +123,7 @@ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ -int max_threads; /* tunable limit on nr_threads */ +static int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; @@ -246,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); - return page ? 
page_address(page) : NULL; + if (likely(page)) { + tsk->stack = page_address(page); + return tsk->stack; + } + return NULL; #endif } @@ -429,7 +435,7 @@ static void release_task_stack(struct task_struct *tsk) #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { - if (atomic_dec_and_test(&tsk->stack_refcount)) + if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif @@ -447,7 +453,7 @@ void free_task(struct task_struct *tsk) * If the task had a separate stack allocation, it should be gone * by now. */ - WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); + WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -671,7 +677,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); @@ -710,14 +715,14 @@ static inline void free_signal_struct(struct signal_struct *sig) static inline void put_signal_struct(struct signal_struct *sig) { - if (atomic_dec_and_test(&sig->sigcnt)) + if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); - WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); cgroup_free(tsk); @@ -816,6 +821,7 @@ void __init fork_init(void) #endif lockdep_init_task(&init_task); + uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -867,7 +873,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->stack_vm_area = stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK - atomic_set(&tsk->stack_refcount, 1); + refcount_set(&tsk->stack_refcount, 1); #endif if (err) @@ -891,12 +897,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif + if (orig->cpus_ptr == &orig->cpus_mask) + tsk->cpus_ptr = &tsk->cpus_mask; /* * One for us, one for whoever does the "release_task()" (usually * parent) */ - atomic_set(&tsk->usage, 2); + refcount_set(&tsk->usage, 2); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif @@ -953,6 +961,15 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static __always_inline void mm_clear_owner(struct mm_struct *mm, + struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + if (mm->owner == p) + WRITE_ONCE(mm->owner, NULL); +#endif +} + static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG @@ -981,7 +998,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; - mm->pinned_vm = 0; + atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); @@ -1223,7 +1240,9 @@ static int wait_for_vfork_done(struct task_struct *child, int killed; freezer_do_not_count(); + cgroup_enter_frozen(); killed = wait_for_completion_killable(vfork); + cgroup_leave_frozen(false); freezer_count(); if (killed) { @@ -1299,13 +1318,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) complete_vfork_done(tsk); } -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. 
+/** + * dup_mm() - duplicates an existing mm structure + * @tsk: the task_struct with which the new mm will be associated. + * @oldmm: the mm to duplicate. + * + * Allocates a new mm structure and duplicates the provided @oldmm structure + * content into it. + * + * Return: the duplicated mm or NULL on failure. */ -static struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk, + struct mm_struct *oldmm) { - struct mm_struct *mm, *oldmm = current->mm; + struct mm_struct *mm; int err; mm = allocate_mm(); @@ -1332,6 +1358,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; + mm_init_owner(mm, NULL); mmput(mm); fail_nomem: @@ -1372,7 +1399,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) } retval = -ENOMEM; - mm = dup_mm(tsk); + mm = dup_mm(tsk, current->mm); if (!mm) goto fail_nomem; @@ -1463,7 +1490,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { - atomic_inc(&current->sighand->count); + refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); @@ -1471,7 +1498,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); @@ -1480,7 +1507,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) { + if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it @@ -1527,7 +1554,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nr_threads = 1; atomic_set(&sig->live, 1); - atomic_set(&sig->sigcnt, 1); + refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); @@ -1663,6 +1690,74 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_RCU */ } +static int pidfd_release(struct inode *inode, struct file *file) +{ + struct pid *pid = file->private_data; + + file->private_data = NULL; + put_pid(pid); + return 0; +} + +#ifdef CONFIG_PROC_FS +static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); + struct pid *pid = f->private_data; + + seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); + seq_putc(m, '\n'); +} +#endif + +/* + * Poll support for process exit notification. + */ +static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) +{ + struct task_struct *task; + struct pid *pid = file->private_data; + int poll_flags = 0; + + poll_wait(file, &pid->wait_pidfd, pts); + + rcu_read_lock(); + task = pid_task(pid, PIDTYPE_PID); + /* + * Inform pollers only when the whole thread group exits. + * If the thread group leader exits before all other threads in the + * group, then poll(2) should block, similar to the wait(2) family.
+ */ + if (!task || (task->exit_state && thread_group_empty(task))) + poll_flags = POLLIN | POLLRDNORM; + rcu_read_unlock(); + + return poll_flags; +} + +const struct file_operations pidfd_fops = { + .release = pidfd_release, + .poll = pidfd_poll, +#ifdef CONFIG_PROC_FS + .show_fdinfo = pidfd_show_fdinfo, +#endif +}; + +static void __delayed_free_task(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + free_task(tsk); +} + +static __always_inline void delayed_free_task(struct task_struct *tsk) +{ + if (IS_ENABLED(CONFIG_MEMCG)) + call_rcu(&tsk->rcu, __delayed_free_task); + else + free_task(tsk); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -1672,18 +1767,16 @@ static inline void rcu_copy_process(struct task_struct *p) * flags). The actual kick-off is left to the caller. */ static __latent_entropy struct task_struct *copy_process( - unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *child_tidptr, struct pid *pid, int trace, - unsigned long tls, - int node) + int node, + struct kernel_clone_args *args) { - int retval; + int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; + struct file *pidfile = NULL; + u64 clone_flags = args->flags; /* * Don't allow sharing the root directory with processes in a different @@ -1731,6 +1824,16 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_PIDFD) { + /* + * - CLONE_DETACHED is blocked so that we can potentially + * reuse it later for CLONE_PIDFD. + * - CLONE_THREAD is blocked until someone really needs it. + */ + if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) + return ERR_PTR(-EINVAL); + } + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -1760,11 +1863,11 @@ static __latent_entropy struct task_struct *copy_process( * p->set_child_tid which is (ab)used as a kthread's data pointer for * kernel threads (PF_KTHREAD). */ - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ - p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); @@ -1869,9 +1972,6 @@ static __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP - p->lockdep_depth = 0; /* no locks held yet */ - p->curr_chain_key = 0; - p->lockdep_recursion = 0; lockdep_init_task(p); #endif @@ -1923,7 +2023,8 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; - retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); + retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, + args->tls); if (retval) goto bad_fork_cleanup_io; @@ -1937,6 +2038,32 @@ static __latent_entropy struct task_struct *copy_process( } } + /* + * This has to happen after we've potentially unshared the file + * descriptor table (so that the pidfd doesn't leak into the child + * if the fd table isn't shared). 
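The pidfd_fops added above give userspace a pollable handle on a process: clone() with CLONE_PIDFD installs the fd in the parent, and the fd becomes readable once the whole thread group has exited. A rough userspace sketch of that flow; it assumes the x86-64 raw clone(2) argument order (flags, stack, parent_tid, child_tid, tls) and defines CLONE_PIDFD locally in case the installed uapi headers predate this series:

#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000		/* value introduced by this series */
#endif

int main(void)
{
	int pidfd = -1;
	/* with legacy clone(), the parent_tid slot returns the pidfd */
	pid_t pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, 0, &pidfd, 0, 0);

	if (pid < 0) {
		perror("clone");
		return 1;
	}
	if (pid == 0) {			/* child */
		sleep(1);
		_exit(0);
	}

	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	poll(&pfd, 1, -1);		/* blocks until the thread group exits */
	printf("child %d exited, pidfd %d is readable\n", pid, pidfd);
	return 0;
}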
+ */ + if (clone_flags & CLONE_PIDFD) { + retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (retval < 0) + goto bad_fork_free_pid; + + pidfd = retval; + + pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfile)) { + put_unused_fd(pidfd); + retval = PTR_ERR(pidfile); + goto bad_fork_free_pid; + } + get_pid(pid); /* held by pidfile now */ + + retval = put_user(pidfd, args->pidfd); + if (retval) + goto bad_fork_put_pidfd; + } + #ifdef CONFIG_BLOCK p->plug = NULL; #endif @@ -1963,7 +2090,7 @@ static __latent_entropy struct task_struct *copy_process( #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif - clear_all_latency_tracing(p); + clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); @@ -1975,7 +2102,7 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PARENT) p->exit_signal = current->group_leader->exit_signal; else - p->exit_signal = (clone_flags & CSIGNAL); + p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -1997,7 +2124,7 @@ static __latent_entropy struct task_struct *copy_process( */ retval = cgroup_can_fork(p); if (retval) - goto bad_fork_free_pid; + goto bad_fork_cgroup_threadgroup_change_end; /* * From this point on we must avoid any synchronous user-space */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boot_ns(); + p->real_start_time = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. @@ -2049,6 +2176,9 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* past the last point of failure */ + if (pidfile) + fd_install(pidfd, pidfile); init_task_pid_links(p); if (likely(p->pid)) { @@ -2082,7 +2212,7 @@ static __latent_entropy struct task_struct *copy_process( } else { current->signal->nr_threads++; atomic_inc(&current->signal->live); - atomic_inc(&current->signal->sigcnt); + refcount_inc(&current->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); @@ -2112,8 +2242,14 @@ bad_fork_cancel_cgroup: spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); -bad_fork_free_pid: +bad_fork_cgroup_threadgroup_change_end: cgroup_threadgroup_change_end(current); +bad_fork_put_pidfd: + if (clone_flags & CLONE_PIDFD) { + fput(pidfile); + put_unused_fd(pidfd); + } +bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -2124,8 +2260,10 @@ bad_fork_cleanup_io: bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: - if (p->mm) + if (p->mm) { + mm_clear_owner(p->mm, p); mmput(p->mm); + } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); @@ -2156,7 +2294,7 @@ bad_fork_cleanup_count: bad_fork_free: p->state = TASK_DEAD; put_task_stack(p); - free_task(p); + delayed_free_task(p); fork_out: spin_lock_irq(&current->sighand->siglock); hlist_del_init(&delayed.node); @@ -2177,8 +2315,11 @@ static inline void init_idle_pids(struct task_struct *idle) struct task_struct *fork_idle(int cpu) { struct task_struct *task; - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, - cpu_to_node(cpu)); + struct kernel_clone_args args = { + .flags = CLONE_VM, + }; + + task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task,
cpu); @@ -2187,19 +2328,20 @@ struct task_struct *fork_idle(int cpu) return task; } +struct mm_struct *copy_init_mm(void) +{ + return dup_mm(NULL, &init_mm); +} + /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. */ -long _do_fork(unsigned long clone_flags, - unsigned long stack_start, - unsigned long stack_size, - int __user *parent_tidptr, - int __user *child_tidptr, - unsigned long tls) +long _do_fork(struct kernel_clone_args *args) { + u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; @@ -2215,7 +2357,7 @@ long _do_fork(unsigned long clone_flags, if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; - else if ((clone_flags & CSIGNAL) != SIGCHLD) + else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; @@ -2224,8 +2366,7 @@ long _do_fork(unsigned long clone_flags, trace = 0; } - p = copy_process(clone_flags, stack_start, stack_size, - child_tidptr, NULL, trace, tls, NUMA_NO_NODE); + p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) @@ -2241,7 +2382,7 @@ long _do_fork(unsigned long clone_flags, nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); + put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; @@ -2264,6 +2405,16 @@ long _do_fork(unsigned long clone_flags, return nr; } +bool legacy_clone_args_valid(const struct kernel_clone_args *kargs) +{ + /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ + if ((kargs->flags & CLONE_PIDFD) && + (kargs->flags & CLONE_PARENT_SETTID)) + return false; + + return true; +} + #ifndef CONFIG_HAVE_COPY_THREAD_TLS /* For compatibility with architectures that call do_fork directly rather than * using the syscall entry points below. 
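The legacy entry points keep a single flag word, so the conversion to struct kernel_clone_args has to split it: the low CSIGNAL byte becomes .exit_signal and everything else stays in .flags, exactly as the do_fork() and sys_clone() wrappers below do. A tiny worked example of that split (flag values from the standard uapi definitions):

/* legacy caller passes: clone_flags = CLONE_VFORK | CLONE_VM | SIGCHLD */
struct kernel_clone_args args = {
	.flags       = (CLONE_VFORK | CLONE_VM | SIGCHLD) & ~CSIGNAL,	/* CLONE_VFORK | CLONE_VM */
	.exit_signal = (CLONE_VFORK | CLONE_VM | SIGCHLD) & CSIGNAL,	/* SIGCHLD */
};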
*/ @@ -2273,8 +2424,20 @@ long do_fork(unsigned long clone_flags, int __user *parent_tidptr, int __user *child_tidptr) { - return _do_fork(clone_flags, stack_start, stack_size, - parent_tidptr, child_tidptr, 0); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = stack_start, + .stack_size = stack_size, + }; + + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + + return _do_fork(&args); } #endif @@ -2283,15 +2446,25 @@ long do_fork(unsigned long clone_flags, */ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) { - return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, - (unsigned long)arg, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return _do_fork(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU - return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); #else /* can not support in nommu mode */ return -EINVAL; @@ -2302,8 +2475,12 @@ SYSCALL_DEFINE0(fork) #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { - return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, - 0, NULL, NULL, 0); + struct kernel_clone_args args = { + .flags = CLONE_VFORK | CLONE_VM, + .exit_signal = SIGCHLD, + }; + + return _do_fork(&args); } #endif @@ -2331,7 +2508,111 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, unsigned long, tls) #endif { - return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); + struct kernel_clone_args args = { + .flags = (clone_flags & ~CSIGNAL), + .pidfd = parent_tidptr, + .child_tid = child_tidptr, + .parent_tid = parent_tidptr, + .exit_signal = (clone_flags & CSIGNAL), + .stack = newsp, + .tls = tls, + }; + + if (!legacy_clone_args_valid(&args)) + return -EINVAL; + + return _do_fork(&args); +} +#endif + +#ifdef __ARCH_WANT_SYS_CLONE3 +noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, + struct clone_args __user *uargs, + size_t size) +{ + struct clone_args args; + + if (unlikely(size > PAGE_SIZE)) + return -E2BIG; + + if (unlikely(size < sizeof(struct clone_args))) + return -EINVAL; + + if (unlikely(!access_ok(uargs, size))) + return -EFAULT; + + if (size > sizeof(struct clone_args)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uargs + sizeof(struct clone_args); + end = (void __user *)uargs + size; + + for (; addr < end; addr++) { + if (get_user(val, addr)) + return -EFAULT; + if (val) + return -E2BIG; + } + + size = sizeof(struct clone_args); + } + + if (copy_from_user(&args, uargs, size)) + return -EFAULT; + + *kargs = (struct kernel_clone_args){ + .flags = args.flags, + .pidfd = u64_to_user_ptr(args.pidfd), + .child_tid = u64_to_user_ptr(args.child_tid), + .parent_tid = u64_to_user_ptr(args.parent_tid), + .exit_signal = args.exit_signal, + .stack = args.stack, + .stack_size = args.stack_size, + .tls = args.tls, + }; + + return 0; +} + +static bool clone3_args_valid(const struct kernel_clone_args *kargs) +{ + /* + * All lower bits of the flag word are taken. + * Verify that no other unknown flags are passed along. 
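copy_clone_args_from_user() above follows the usual extensible-struct convention: userspace passes sizeof(its struct clone_args); larger structs are accepted only when every trailing byte is zero, smaller ones are rejected. A hedged userspace sketch of calling the new syscall; the struct layout mirrors the kernel_clone_args fields copied above, and the __NR_clone3 fallback value (435, the generic table slot) is an assumption, since the syscall wiring is outside this hunk:

#include <linux/types.h>
#include <signal.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef __NR_clone3
#define __NR_clone3 435			/* assumed generic syscall number */
#endif

struct clone_args {			/* mirrors the uapi struct added by this series */
	__aligned_u64 flags;
	__aligned_u64 pidfd;
	__aligned_u64 child_tid;
	__aligned_u64 parent_tid;
	__aligned_u64 exit_signal;
	__aligned_u64 stack;
	__aligned_u64 stack_size;
	__aligned_u64 tls;
};

static pid_t sys_clone3(struct clone_args *args)
{
	/* the size argument is what the trailing-zero check is applied against */
	return syscall(__NR_clone3, args, sizeof(*args));
}

int main(void)
{
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.exit_signal = SIGCHLD;	/* plain fork()-like child */

	pid_t pid = sys_clone3(&args);
	if (pid == 0)
		_exit(0);		/* child */
	return pid > 0 ? 0 : 1;
}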
+ */ + if (kargs->flags & ~CLONE_LEGACY_FLAGS) + return false; + + /* + * - make the CLONE_DETACHED bit reuseable for clone3 + * - make the CSIGNAL bits reuseable for clone3 + */ + if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) + return false; + + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && + kargs->exit_signal) + return false; + + return true; +} + +SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) +{ + int err; + + struct kernel_clone_args kargs; + + err = copy_clone_args_from_user(&kargs, uargs, size); + if (err) + return err; + + if (!clone3_args_valid(&kargs)) + return -EINVAL; + + return _do_fork(&kargs); } #endif @@ -2439,7 +2720,7 @@ static int check_unshare_flags(unsigned long unshare_flags) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { - if (atomic_read(&current->sighand->count) > 1) + if (refcount_read(&current->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { diff --git a/kernel/freezer.c b/kernel/freezer.c index b162b74611e4..c0738424bb43 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/freezer.c - Function to freeze a process * diff --git a/kernel/futex.c b/kernel/futex.c index a0514e01c3eb..6d50728ef2e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Fast Userspace Mutexes (which I call "Futexes!"). * (C) Rusty Russell, IBM 2002 @@ -29,20 +30,6 @@ * * "The futexes are also cursed." * "But they come in a choice of three flavours!" - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/compat.h> #include <linux/slab.h> @@ -68,6 +55,7 @@ #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> +#include <linux/refcount.h> #include <asm/futex.h> @@ -212,7 +200,7 @@ struct futex_pi_state { struct rt_mutex pi_mutex; struct task_struct *owner; - atomic_t refcount; + refcount_t refcount; union futex_key key; } __randomize_layout; @@ -321,12 +309,8 @@ static int __init fail_futex_debugfs(void) if (IS_ERR(dir)) return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-private", mode, dir, - &fail_futex.ignore_private)) { - debugfs_remove_recursive(dir); - return -ENOMEM; - } - + debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private); return 0; } @@ -487,6 +471,37 @@ enum futex_access { }; /** + * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + * value given + */ +static inline struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(timeout, current); + + /* + * If range_ns is 0, calling hrtimer_set_expires_range_ns() is + * effectively the same as calling hrtimer_set_expires(). + */ + hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); + + return timeout; +} + +/** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED @@ -546,7 +561,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. @@ -803,7 +818,7 @@ static int refill_pi_state_cache(void) INIT_LIST_HEAD(&pi_state->list); /* pi_mutex gets initialized later */ pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); + refcount_set(&pi_state->refcount, 1); pi_state->key = FUTEX_KEY_INIT; current->pi_state_cache = pi_state; @@ -823,7 +838,7 @@ static struct futex_pi_state *alloc_pi_state(void) static void get_pi_state(struct futex_pi_state *pi_state) { - WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); + WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); } /* @@ -835,7 +850,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) if (!pi_state) return; - if (!atomic_dec_and_test(&pi_state->refcount)) + if (!refcount_dec_and_test(&pi_state->refcount)) return; /* @@ -865,7 +880,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * refcount is at 0 - put it back to 1. */ pi_state->owner = NULL; - atomic_set(&pi_state->refcount, 1); + refcount_set(&pi_state->refcount, 1); current->pi_state_cache = pi_state; } } @@ -908,7 +923,7 @@ void exit_pi_state_list(struct task_struct *curr) * In that case; drop the locks to let put_pi_state() make * progress and retry the loop. */ - if (!atomic_inc_not_zero(&pi_state->refcount)) { + if (!refcount_inc_not_zero(&pi_state->refcount)) { raw_spin_unlock_irq(&curr->pi_lock); cpu_relax(); raw_spin_lock_irq(&curr->pi_lock); @@ -1064,7 +1079,7 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval, * and futex_wait_requeue_pi() as it cannot go to 0 and consequently * free pi_state before we can take a reference ourselves. */ - WARN_ON(!atomic_read(&pi_state->refcount)); + WARN_ON(!refcount_read(&pi_state->refcount)); /* * Now that we have a pi_state, we can acquire wait_lock @@ -1314,13 +1329,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) { + int err; u32 uninitialized_var(curval); if (unlikely(should_fail_futex(true))) return -EFAULT; - if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) - return -EFAULT; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (unlikely(err)) + return err; /* If user space value changed, let the caller retry */ return curval != uval ? 
-EAGAIN : 0; @@ -1467,8 +1484,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * Queue the task for later wakeup for after we've released * the hb->lock. wake_q_add() grabs reference to p. */ - wake_q_add(wake_q, p); - put_task_struct(p); + wake_q_add_safe(wake_q, p); } /* @@ -1506,10 +1522,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ if (unlikely(should_fail_futex(true))) ret = -EFAULT; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { - ret = -EFAULT; - - } else if (curval != uval) { + ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (!ret && (curval != uval)) { /* * If a unconditional UNLOCK_PI operation (user space did not * try the TID->0 transition) raced with a waiter setting the @@ -1704,32 +1718,32 @@ retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { - double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU - /* - * we don't get EFAULT from MMU faults if we don't have an MMU, - * but we might get them from range checking - */ - ret = op_ret; - goto out_put_keys; -#endif - - if (unlikely(op_ret != -EFAULT)) { + if (!IS_ENABLED(CONFIG_MMU) || + unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { + /* + * we don't get EFAULT from MMU faults if we don't have + * an MMU, but we might get them from range checking + */ ret = op_ret; goto out_put_keys; } - ret = fault_in_user_writeable(uaddr2); - if (ret) - goto out_put_keys; + if (op_ret == -EFAULT) { + ret = fault_in_user_writeable(uaddr2); + if (ret) + goto out_put_keys; + } - if (!(flags & FLAGS_SHARED)) + if (!(flags & FLAGS_SHARED)) { + cond_resched(); goto retry_private; + } put_futex_key(&key2); put_futex_key(&key1); + cond_resched(); goto retry; } @@ -2354,7 +2368,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 uval, uninitialized_var(curval), newval; struct task_struct *oldowner, *newowner; u32 newtid; - int ret; + int ret, err = 0; lockdep_assert_held(q->lock_ptr); @@ -2425,14 +2439,17 @@ retry: if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; - if (get_futex_value_locked(&uval, uaddr)) - goto handle_fault; + err = get_futex_value_locked(&uval, uaddr); + if (err) + goto handle_err; for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) - goto handle_fault; + err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); + if (err) + goto handle_err; + if (curval == uval) break; uval = curval; @@ -2460,23 +2477,37 @@ retry: return 0; /* - * To handle the page fault we need to drop the locks here. That gives - * the other task (either the highest priority waiter itself or the - * task which stole the rtmutex) the chance to try the fixup of the - * pi_state. So once we are back from handling the fault we need to - * check the pi_state after reacquiring the locks and before trying to - * do another fixup. When the fixup has been done already we simply - * return. + * In order to reschedule or handle a page fault, we need to drop the + * locks here. In the case of a fault, this gives the other task + * (either the highest priority waiter itself or the task which stole + * the rtmutex) the chance to try the fixup of the pi_state. So once we + * are back from handling the fault we need to check the pi_state after + * reacquiring the locks and before trying to do another fixup. When + * the fixup has been done already we simply return. 
* * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely * drop hb->lock since the caller owns the hb -> futex_q relation. * Dropping the pi_mutex->wait_lock requires the state revalidate. */ -handle_fault: +handle_err: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); - ret = fault_in_user_writeable(uaddr); + switch (err) { + case -EFAULT: + ret = fault_in_user_writeable(uaddr); + break; + + case -EAGAIN: + cond_resched(); + ret = 0; + break; + + default: + WARN_ON_ONCE(1); + ret = err; + break; + } spin_lock(q->lock_ptr); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -2679,7 +2710,7 @@ out: static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2689,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, return -EINVAL; q.bitset = bitset; - if (abs_time) { - to = &timeout; - - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } - + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. On success, holds hb lock and increments @@ -2779,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart) static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2792,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (refill_pi_state_cache()) return -ENOMEM; - if (time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires(&to->timer, *time); - } + to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); @@ -3045,10 +3061,8 @@ retry: * A unconditional UNLOCK_PI op raced against a waiter * setting the FUTEX_WAITERS bit. Try again. */ - if (ret == -EAGAIN) { - put_futex_key(&key); - goto retry; - } + if (ret == -EAGAIN) + goto pi_retry; /* * wake_futex_pi has detected invalid state. Tell user * space. @@ -3063,9 +3077,19 @@ retry: * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. 
*/ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) { spin_unlock(&hb->lock); - goto pi_faulted; + switch (ret) { + case -EFAULT: + goto pi_faulted; + + case -EAGAIN: + goto pi_retry; + + default: + WARN_ON_ONCE(1); + goto out_putkey; + } } /* @@ -3079,6 +3103,11 @@ out_putkey: put_futex_key(&key); return ret; +pi_retry: + put_futex_key(&key); + cond_resched(); + goto retry; + pi_faulted: put_futex_key(&key); @@ -3182,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -3199,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (!bitset) return -EINVAL; - if (abs_time) { - to = &timeout; - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(&to->timer, *abs_time, - current->timer_slack_ns); - } + to = futex_setup_timer(abs_time, &timeout, flags, + current->timer_slack_ns); /* * The waiter is allocated on our stack, manipulated by the requeue @@ -3439,47 +3461,67 @@ err_unlock: static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; + int err; + + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { - /* - * Ok, this dying thread is truly holding a futex - * of interest. Set the OWNER_DIED bit atomically - * via cmpxchg, and if the value had FUTEX_WAITERS - * set, wake up a waiter (if any). (We have to do a - * futex_wake() even if OWNER_DIED is already set - - * to handle the rare but possible case of recursive - * thread-death.) The rest of the cleanup is done in - * userspace. - */ - mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - /* - * We are not holding a lock here, but we want to have - * the pagefault_disable/enable() protection because - * we want to handle the fault gracefully. If the - * access fails we try to fault in the futex with R/W - * verification via get_user_pages. get_user() above - * does not guarantee R/W access. If that fails we - * give up and leave the futex locked. - */ - if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. 
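The same error-handling shape now appears in fixup_pi_state_owner(), futex_unlock_pi() and handle_futex_death(): cmpxchg_futex_value_locked() reports an errno instead of being treated as a plain fault, -EFAULT means fault the page in (or give up), and -EAGAIN means the architecture wants the operation retried, so reschedule and loop. A condensed sketch of that pattern, not a literal copy of any one call site:

/* sketch only: uaddr, uval and newval as at the call sites above */
for (;;) {
	u32 curval;
	int err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval);

	if (!err)
		break;				/* exchange issued; inspect curval next */

	switch (err) {
	case -EFAULT:
		if (fault_in_user_writeable(uaddr))
			return -1;		/* unresolvable fault: give up */
		break;				/* page is in, retry */
	case -EAGAIN:
		cond_resched();			/* arch asked for a retry */
		break;
	default:
		WARN_ON_ONCE(1);
		return err;
	}
}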
+ */ + if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: if (fault_in_user_writeable(uaddr)) return -1; goto retry; - } - if (nval != uval) + + case -EAGAIN: + cond_resched(); goto retry; - /* - * Wake robust non-PI futexes here. The wakeup of - * PI futexes happens in exit_pi_state(): - */ - if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + default: + WARN_ON_ONCE(1); + return err; + } } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; } @@ -3823,7 +3865,7 @@ err_unlock: #endif /* CONFIG_COMPAT */ #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, +SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "GCOV-based kernel profiling" config GCOV_KERNEL @@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL choice prompt "Specify GCOV format" depends on GCOV_KERNEL + depends on CC_IS_GCC ---help--- The gcov format is usually determined by the GCC version, and the default is chosen according to your GCC version. However, there are @@ -62,7 +64,7 @@ choice config GCOV_FORMAT_3_4 bool "GCC 3.4 format" - depends on CC_IS_GCC && GCC_VERSION < 40700 + depends on GCC_VERSION < 40700 ---help--- Select this option to use the format defined by GCC 3.4. diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -2,5 +2,6 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-y := base.o fs.o -obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o -obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o +obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o +obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o +obj-$(CONFIG_CC_IS_CLANG) += clang.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -22,88 +22,8 @@ #include <linux/sched.h> #include "gcov.h" -static int gcov_events_enabled; -static DEFINE_MUTEX(gcov_lock); - -/* - * __gcov_init is called by gcc-generated constructor code for each object - * file compiled with -fprofile-arcs. - */ -void __gcov_init(struct gcov_info *info) -{ - static unsigned int gcov_version; - - mutex_lock(&gcov_lock); - if (gcov_version == 0) { - gcov_version = gcov_info_version(info); - /* - * Printing gcc's version magic may prove useful for debugging - * incompatibility reports. - */ - pr_info("version magic: 0x%x\n", gcov_version); - } - /* - * Add new profiling data structure to list and inform event - * listener. - */ - gcov_info_link(info); - if (gcov_events_enabled) - gcov_event(GCOV_ADD, info); - mutex_unlock(&gcov_lock); -} -EXPORT_SYMBOL(__gcov_init); - -/* - * These functions may be referenced by gcc-generated profiling code but serve - * no function for kernel profiling. - */ -void __gcov_flush(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_flush); - -void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. 
*/ -} -EXPORT_SYMBOL(__gcov_merge_add); - -void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_single); - -void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_delta); - -void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_ior); - -void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_time_profile); - -void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_merge_icall_topn); - -void __gcov_exit(void) -{ - /* Unused. */ -} -EXPORT_SYMBOL(__gcov_exit); +int gcov_events_enabled; +DEFINE_MUTEX(gcov_lock); /** * gcov_enable_events - enable event reporting through gcov_event() @@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within_module((unsigned long)info, mod)) { + if (gcov_info_within_module(info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Google, Inc. + * modified from kernel/gcov/gcc_4_7.c + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * + * LLVM uses profiling data that's deliberately similar to GCC, but has a + * very different way of exporting that data. LLVM calls llvm_gcov_init() once + * per module, and provides a couple of callbacks that we can use to ask for + * more data. + * + * We care about the "writeout" callback, which in turn calls back into + * compiler-rt/this module to dump all the gathered coverage data to disk: + * + * llvm_gcda_start_file() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * llvm_gcda_emit_function() + * llvm_gcda_emit_arcs() + * [... repeats for each function ...] + * llvm_gcda_summary_info() + * llvm_gcda_end_file() + * + * This design is much more stateless and unstructured than gcc's, and is + * intended to run at process exit. This forces us to keep some local state + * about which module we're dealing with at the moment. On the other hand, it + * also means we don't depend as much on how LLVM represents profiling data + * internally. + * + * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more + * details on how this works, particularly GCOVProfiler::emitProfileArcs(), + * GCOVProfiler::insertCounterWriteout(), and + * GCOVProfiler::insertFlush(). 
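The writeout sequence described above is driven entirely by compiler-generated code; the kernel side only records which gcov_info is currently being filled in. A rough, purely illustrative sketch of what the clang-emitted per-object constructor amounts to (the symbol names, version string and checksums below are made up, and llvm_gcov_init() invokes the writeout callback immediately to populate its gcov_info):

/* illustrative only -- clang emits equivalent machinery per instrumented object */
static u64 __counters_fn1[2];

static void __llvm_gcov_writeout(void)
{
	llvm_gcda_start_file("kernel/foo.gcda", "407*", 0x12345678);

	llvm_gcda_emit_function(1, "fn1", 0xdeadbeef, 1, 0xcafef00d);
	llvm_gcda_emit_arcs(ARRAY_SIZE(__counters_fn1), __counters_fn1);
	/* ... one emit_function()/emit_arcs() pair per instrumented function ... */

	llvm_gcda_summary_info();
	llvm_gcda_end_file();
}

static void __llvm_gcov_flush(void)
{
	/* no-op in the kernel */
}

static void __llvm_gcov_ctor(void)	/* runs as a constructor */
{
	llvm_gcov_init(__llvm_gcov_writeout, __llvm_gcov_flush);
}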
+ */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/printk.h> +#include <linux/ratelimit.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include "gcov.h" + +typedef void (*llvm_gcov_callback)(void); + +struct gcov_info { + struct list_head head; + + const char *filename; + unsigned int version; + u32 checksum; + + struct list_head functions; +}; + +struct gcov_fn_info { + struct list_head head; + + u32 ident; + u32 checksum; + u8 use_extra_checksum; + u32 cfg_checksum; + + u32 num_counters; + u64 *counters; + const char *function_name; +}; + +static struct gcov_info *current_info; + +static LIST_HEAD(clang_gcov_list); + +void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) +{ + struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + INIT_LIST_HEAD(&info->functions); + + mutex_lock(&gcov_lock); + + list_add_tail(&info->head, &clang_gcov_list); + current_info = info; + writeout(); + current_info = NULL; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(llvm_gcov_init); + +void llvm_gcda_start_file(const char *orig_filename, const char version[4], + u32 checksum) +{ + current_info->filename = orig_filename; + memcpy(&current_info->version, version, sizeof(current_info->version)); + current_info->checksum = checksum; +} +EXPORT_SYMBOL(llvm_gcda_start_file); + +void llvm_gcda_emit_function(u32 ident, const char *function_name, + u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) +{ + struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return; + + INIT_LIST_HEAD(&info->head); + info->ident = ident; + info->checksum = func_checksum; + info->use_extra_checksum = use_extra_checksum; + info->cfg_checksum = cfg_checksum; + if (function_name) + info->function_name = kstrdup(function_name, GFP_KERNEL); + + list_add_tail(&info->head, &current_info->functions); +} +EXPORT_SYMBOL(llvm_gcda_emit_function); + +void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) +{ + struct gcov_fn_info *info = list_last_entry(&current_info->functions, + struct gcov_fn_info, head); + + info->num_counters = num_counters; + info->counters = counters; +} +EXPORT_SYMBOL(llvm_gcda_emit_arcs); + +void llvm_gcda_summary_info(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_summary_info); + +void llvm_gcda_end_file(void) +{ +} +EXPORT_SYMBOL(llvm_gcda_end_file); + +/** + * gcov_info_filename - return info filename + * @info: profiling data set + */ +const char *gcov_info_filename(struct gcov_info *info) +{ + return info->filename; +} + +/** + * gcov_info_version - return info version + * @info: profiling data set + */ +unsigned int gcov_info_version(struct gcov_info *info) +{ + return info->version; +} + +/** + * gcov_info_next - return next profiling data set + * @info: profiling data set + * + * Returns next gcov_info following @info or first gcov_info in the chain if + * @info is %NULL.
+ */ +struct gcov_info *gcov_info_next(struct gcov_info *info) +{ + if (!info) + return list_first_entry_or_null(&clang_gcov_list, + struct gcov_info, head); + if (list_is_last(&info->head, &clang_gcov_list)) + return NULL; + return list_next_entry(info, head); +} + +/** + * gcov_info_link - link/add profiling data set to the list + * @info: profiling data set + */ +void gcov_info_link(struct gcov_info *info) +{ + list_add_tail(&info->head, &clang_gcov_list); +} + +/** + * gcov_info_unlink - unlink/remove profiling data set from the list + * @prev: previous profiling data set + * @info: profiling data set + */ +void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) +{ + /* Generic code unlinks while iterating. */ + __list_del_entry(&info->head); +} + +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info->filename, mod); +} + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + struct gcov_fn_info *fn; + + list_for_each_entry(fn, &info->functions, head) + memset(fn->counters, 0, + sizeof(fn->counters[0]) * fn->num_counters); +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( + &info1->functions, struct gcov_fn_info, head); + struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( + &info2->functions, struct gcov_fn_info, head); + + if (info1->checksum != info2->checksum) + return false; + if (!fn_ptr1) + return fn_ptr1 == fn_ptr2; + while (!list_is_last(&fn_ptr1->head, &info1->functions) && + !list_is_last(&fn_ptr2->head, &info2->functions)) { + if (fn_ptr1->checksum != fn_ptr2->checksum) + return false; + if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) + return false; + if (fn_ptr1->use_extra_checksum && + fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) + return false; + fn_ptr1 = list_next_entry(fn_ptr1, head); + fn_ptr2 = list_next_entry(fn_ptr2, head); + } + return list_is_last(&fn_ptr1->head, &info1->functions) && + list_is_last(&fn_ptr2->head, &info2->functions); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. 
+ */ +void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) +{ + struct gcov_fn_info *dfn_ptr; + struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, + struct gcov_fn_info, head); + + list_for_each_entry(dfn_ptr, &dst->functions, head) { + u32 i; + + for (i = 0; i < sfn_ptr->num_counters; i++) + dfn_ptr->counters[i] += sfn_ptr->counters[i]; + } +} + +static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) +{ + size_t cv_size; /* counter values size */ + struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), + GFP_KERNEL); + if (!fn_dup) + return NULL; + INIT_LIST_HEAD(&fn_dup->head); + + fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); + if (!fn_dup->function_name) + goto err_name; + + cv_size = fn->num_counters * sizeof(fn->counters[0]); + fn_dup->counters = vmalloc(cv_size); + if (!fn_dup->counters) + goto err_counters; + memcpy(fn_dup->counters, fn->counters, cv_size); + + return fn_dup; + +err_counters: + kfree(fn_dup->function_name); +err_name: + kfree(fn_dup); + return NULL; +} + +/** + * gcov_info_dup - duplicate profiling data set + * @info: profiling data set to duplicate + * + * Return newly allocated duplicate on success, %NULL on error. + */ +struct gcov_info *gcov_info_dup(struct gcov_info *info) +{ + struct gcov_info *dup; + struct gcov_fn_info *fn; + + dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); + if (!dup) + return NULL; + INIT_LIST_HEAD(&dup->head); + INIT_LIST_HEAD(&dup->functions); + dup->filename = kstrdup(info->filename, GFP_KERNEL); + if (!dup->filename) + goto err; + + list_for_each_entry(fn, &info->functions, head) { + struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); + + if (!fn_dup) + goto err; + list_add_tail(&fn_dup->head, &dup->functions); + } + + return dup; + +err: + gcov_info_free(dup); + return NULL; +} + +/** + * gcov_info_free - release memory for profiling data set duplicate + * @info: profiling data set duplicate to free + */ +void gcov_info_free(struct gcov_info *info) +{ + struct gcov_fn_info *fn, *tmp; + + list_for_each_entry_safe(fn, tmp, &info->functions, head) { + kfree(fn->function_name); + vfree(fn->counters); + list_del(&fn->head); + kfree(fn); + } + kfree(info->filename); + kfree(info); +} + +#define ITER_STRIDE PAGE_SIZE + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @buffer: buffer containing file data + * @size: size of buffer + * @pos: current position in file + */ +struct gcov_iterator { + struct gcov_info *info; + void *buffer; + size_t size; + loff_t pos; +}; + +/** + * store_gcov_u32 - store 32 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't + * store anything. + */ +static size_t store_gcov_u32(void *buffer, size_t off, u32 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + *data = v; + } + + return sizeof(*data); +} + +/** + * store_gcov_u64 - store 64 bit number in gcov format to buffer + * @buffer: target buffer or NULL + * @off: offset into the buffer + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 
64 bit numbers are stored as two 32 bit numbers, the low part + * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store + * anything. + */ +static size_t store_gcov_u64(void *buffer, size_t off, u64 v) +{ + u32 *data; + + if (buffer) { + data = buffer + off; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + } + + return sizeof(*data) * 2; +} + +/** + * convert_to_gcda - convert profiling data set to gcda file format + * @buffer: the buffer to store file data or %NULL if no data should be stored + * @info: profiling data set to be converted + * + * Returns the number of bytes that were/would have been stored into the buffer. + */ +static size_t convert_to_gcda(char *buffer, struct gcov_info *info) +{ + struct gcov_fn_info *fi_ptr; + size_t pos = 0; + + /* File header. */ + pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); + pos += store_gcov_u32(buffer, pos, info->version); + pos += store_gcov_u32(buffer, pos, info->checksum); + + list_for_each_entry(fi_ptr, &info->functions, head) { + u32 i; + u32 len = 2; + + if (fi_ptr->use_extra_checksum) + len++; + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); + pos += store_gcov_u32(buffer, pos, len); + pos += store_gcov_u32(buffer, pos, fi_ptr->ident); + pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); + if (fi_ptr->use_extra_checksum) + pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); + + pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); + pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); + for (i = 0; i < fi_ptr->num_counters; i++) + pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); + } + + return pos; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. + */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); + if (!iter) + goto err_free; + + iter->info = info; + /* Dry-run to get the actual buffer size. */ + iter->size = convert_to_gcda(NULL, info); + iter->buffer = vmalloc(iter->size); + if (!iter->buffer) + goto err_free; + + convert_to_gcda(iter->buffer, info); + + return iter; + +err_free: + kfree(iter); + return NULL; +} + + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + vfree(iter->buffer); + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + iter->pos = 0; +} + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. + */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + if (iter->pos < iter->size) + iter->pos += ITER_STRIDE; + + if (iter->pos >= iter->size) + return -EINVAL; + + return 0; +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. 
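Putting store_gcov_u32()/store_gcov_u64() and convert_to_gcda() above together, the buffer handed out through the iterator is a flat sequence of native-endian 32-bit words. An illustrative worked layout for one function with two counters, written as a C comment since the field names stand in for values convert_to_gcda() reads from the gcov_fn_info (tag constants come from gcov.h):

/*
 * Illustrative .gcda buffer produced by convert_to_gcda() for one function
 * with two counters and use_extra_checksum set (one native-endian u32 per line):
 *
 *   GCOV_DATA_MAGIC
 *   info->version
 *   info->checksum
 *   GCOV_TAG_FUNCTION
 *   3                      (record length: ident + checksum + cfg_checksum)
 *   fi->ident
 *   fi->checksum
 *   fi->cfg_checksum       (omitted, and length 2, without the extra checksum)
 *   GCOV_TAG_COUNTER_BASE
 *   4                      (num_counters * 2: each u64 becomes two u32s, low word first)
 *   counters[0] low, counters[0] high
 *   counters[1] low, counters[1] high
 */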
+ */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + size_t len; + + if (iter->pos >= iter->size) + return -EINVAL; + + len = ITER_STRIDE; + if (iter->pos + len > iter->size) + len = iter->size - iter->pos; + + seq_write(seq, iter->buffer + iter->pos, len); + + return 0; +} diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 6e40ff6be083..e5eb5ea7ea59 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -64,7 +64,6 @@ struct gcov_node { static const char objtree[] = OBJTREE; static const char srctree[] = SRCTREE; static struct gcov_node root_node; -static struct dentry *reset_dentry; static LIST_HEAD(all_head); static DEFINE_MUTEX(node_lock); @@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent) goto out_err; node->links[i] = debugfs_create_symlink(deskew(basename), parent, target); - if (!node->links[i]) - goto out_err; kfree(target); } @@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent, parent->dentry, node, &gcov_data_fops); } else node->dentry = debugfs_create_dir(node->name, parent->dentry); - if (!node->dentry) { - pr_warn("could not create file\n"); - kfree(node); - return NULL; - } if (info) add_links(node, parent->dentry); list_add(&node->list, &parent->children); @@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) /* Create debugfs entries. */ static __init int gcov_fs_init(void) { - int rc = -EIO; - init_node(&root_node, NULL, NULL, NULL); /* * /sys/kernel/debug/gcov will be parent for the reset control file * and all profiling files. */ root_node.dentry = debugfs_create_dir("gcov", NULL); - if (!root_node.dentry) - goto err_remove; /* * Create reset file which resets all profiling counts when written * to. */ - reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, - NULL, &gcov_reset_fops); - if (!reset_dentry) - goto err_remove; + debugfs_create_file("reset", 0600, root_node.dentry, NULL, + &gcov_reset_fops); /* Replay previous events to get our fs hierarchy up-to-date. */ gcov_enable_events(); return 0; - -err_remove: - pr_err("init failed\n"); - debugfs_remove(root_node.dentry); - - return rc; } device_initcall(gcov_fs_init); diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 1e32e66c9563..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ @@ -245,8 +257,7 @@ struct gcov_info *gcov_info_dup(struct gcov_info *info) /* Duplicate gcov_info. 
*/ active = num_counter_active(info); - dup = kzalloc(sizeof(struct gcov_info) + - sizeof(struct gcov_ctr_info) * active, GFP_KERNEL); + dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL); if (!dup) return NULL; dup->version = info->version; @@ -364,8 +375,7 @@ struct gcov_iterator *gcov_iter_new(struct gcov_info *info) { struct gcov_iterator *iter; - iter = kzalloc(sizeof(struct gcov_iterator) + - num_counter_active(info) * sizeof(struct type_info), + iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)), GFP_KERNEL); if (iter) iter->info = info; diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) gcov_info_head = info->next; } +/** + * gcov_info_within_module - check if a profiling data set belongs to a module + * @info: profiling data set + * @mod: module + * + * Returns true if profiling data belongs module, false otherwise. + */ +bool gcov_info_within_module(struct gcov_info *info, struct module *mod) +{ + return within_module((unsigned long)info, mod); +} + /* Symbolic links to be created for each profiling data file. */ const struct gcov_link gcov_link[] = { { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/export.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include "gcov.h" + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = gcov_info_version(info); + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + gcov_info_link(info); + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_ior); + +void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_time_profile); + +void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_icall_topn); + +void __gcov_exit(void) +{ + /* Unused. 
*/ +} +EXPORT_SYMBOL(__gcov_exit); diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -15,6 +15,7 @@ #ifndef GCOV_H #define GCOV_H GCOV_H +#include <linux/module.h> #include <linux/types.h> /* @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); struct gcov_info *gcov_info_next(struct gcov_info *info); void gcov_info_link(struct gcov_info *info); void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); +bool gcov_info_within_module(struct gcov_info *info, struct module *mod); /* Base interface. */ enum gcov_action { @@ -83,4 +85,7 @@ struct gcov_link { }; extern const struct gcov_link gcov_link[]; +extern int gcov_events_enabled; +extern struct mutex gcov_lock; + #endif /* GCOV_H */ diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh new file mode 100755 index 000000000000..9ff449888d9c --- /dev/null +++ b/kernel/gen_kheaders.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS. +set -e +sfile="$(readlink -f "$0")" +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +dir_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; +# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi +# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter +# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $srctree > /dev/null +src_files_md5="$(find $dir_list -name "*.h" | + grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + xargs ls -l | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $dir_list -name "*.h" | + grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | + xargs ls -l | md5sum | cut -d ' ' -f1)" +# Any changes to this script will also cause a rebuild of the archive. +this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $srctree > /dev/null +for f in $dir_list; + do find "$f" -name "*.h"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $dir_list; + do find "$f" -name "*.h"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . 
> /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$this_file_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 4a9191617076..14a625c16cb3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Detect Hung Task * @@ -19,6 +20,7 @@ #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> +#include <linux/sched/sysctl.h> #include <trace/events/sched.h> @@ -126,7 +128,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (sysctl_hung_task_warnings > 0) sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", - t->comm, t->pid, timeout); + t->comm, t->pid, (jiffies - t->last_switch_time) / HZ); pr_err(" %s %s %.*s\n", print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..62c92e43aa0d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size, * * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return * a pointer into the direct map. * @@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { /* - * MEMREMAP_WB is special in that it can be satisifed + * MEMREMAP_WB is special in that it can be satisfied * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where * the requested range is potentially in System RAM. 
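The memremap() hunks above only correct kernel-doc wording, but the behaviour they describe (MEMREMAP_WB being satisfied straight from the direct map when the range is System RAM) is easy to get wrong in callers. A minimal usage sketch, assuming a purely hypothetical physical window (EXAMPLE_PHYS_BASE/EXAMPLE_PHYS_SIZE) and no particular device:

#include <linux/io.h>

#define EXAMPLE_PHYS_BASE       0x80000000UL    /* hypothetical address */
#define EXAMPLE_PHYS_SIZE       0x1000UL        /* hypothetical size */

static void *example_map_window(void)
{
        /*
         * MEMREMAP_WB may return a pointer into the direct map instead of
         * creating a new mapping, so the returned address must only ever
         * be released with memunmap(), never with iounmap() directly.
         */
        return memremap(EXAMPLE_PHYS_BASE, EXAMPLE_PHYS_SIZE, MEMREMAP_WB);
}

static void example_unmap_window(void *va)
{
        /* memunmap() decides whether an iounmap() is actually needed. */
        memunmap(va);
}

The memunmap() hunk right below tightens exactly that decision, switching the check from is_vmalloc_addr() to is_ioremap_addr() before calling iounmap().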
@@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap); void memunmap(void *addr) { - if (is_vmalloc_addr(addr)) + if (is_ioremap_addr(addr)) iounmap((void __iomem *) addr); } EXPORT_SYMBOL(memunmap); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..f92d9a687372 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only menu "IRQ subsystem" # Options selectable by the architecture code @@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN select IRQ_DOMAIN_HIERARCHY select GENERIC_MSI_IRQ +config IRQ_MSI_IOMMU + bool + config HANDLE_DOMAIN_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index ff6e352e3a6c..b4f53717d143 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -2,6 +2,9 @@ obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_IRQ_TIMINGS) += timings.o +ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) + CFLAGS_timings.o += -DDEBUG +endif obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 45b68b4ea48b..4352b08ae48d 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -9,7 +9,7 @@ #include <linux/cpu.h> static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, - int cpus_per_vec) + unsigned int cpus_per_vec) { const struct cpumask *siblmsk; int cpu, sibl; @@ -94,16 +94,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, return nodes; } -static int __irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, +static int __irq_build_affinity_masks(unsigned int startvec, + unsigned int numvecs, + unsigned int firstvec, cpumask_var_t *node_to_cpumask, const struct cpumask *cpu_mask, struct cpumask *nmsk, struct irq_affinity_desc *masks) { - int n, nodes, cpus_per_vec, extra_vecs, done = 0; - int last_affv = firstvec + numvecs; - int curvec = startvec; + unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0; + unsigned int last_affv = firstvec + numvecs; + unsigned int curvec = startvec; nodemask_t nodemsk = NODE_MASK_NONE; if (!cpumask_weight(cpu_mask)) @@ -117,18 +118,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, - &masks[curvec].mask, - node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, + node_to_cpumask[n]); if (++curvec == last_affv) curvec = firstvec; } - done = numvecs; - goto out; + return numvecs; } for_each_node_mask(n, nodemsk) { - int ncpus, v, vecs_to_assign, vecs_per_node; + unsigned int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ vecs_per_node = (numvecs - (curvec - firstvec)) / nodes; @@ -163,8 +162,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, curvec = firstvec; --nodes; } - -out: return done; } @@ -173,20 +170,24 @@ out: * 1) spread present CPU on these vectors * 2) spread other possible CPUs on these vectors */ -static int irq_build_affinity_masks(const struct irq_affinity *affd, - int startvec, int numvecs, int firstvec, - cpumask_var_t *node_to_cpumask, +static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, + unsigned int firstvec, struct irq_affinity_desc *masks) { - int curvec = startvec, nr_present, nr_others; - int ret = -ENOMEM; + unsigned int curvec = startvec, 
nr_present, nr_others; + cpumask_var_t *node_to_cpumask; cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) return ret; if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail; + goto fail_nmsk; + + node_to_cpumask = alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; ret = 0; /* Stabilize the cpumasks */ @@ -194,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, build_node_to_cpumask(node_to_cpumask); /* Spread on present CPUs starting from affd->pre_vectors */ - nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_present = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, cpu_present_mask, nmsk, masks); @@ -209,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, else curvec = firstvec + nr_present; cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, + nr_others = __irq_build_affinity_masks(curvec, numvecs, firstvec, node_to_cpumask, npresmsk, nmsk, masks); put_online_cpus(); @@ -217,13 +218,22 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, if (nr_present < numvecs) WARN_ON(nr_present + nr_others < numvecs); + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: free_cpumask_var(npresmsk); - fail: + fail_nmsk: free_cpumask_var(nmsk); return ret; } +static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +{ + affd->nr_sets = 1; + affd->set_size[0] = affvecs; +} + /** * irq_create_affinity_masks - Create affinity masks for multiqueue spreading * @nvecs: The total number of vectors @@ -232,50 +242,62 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, * Returns the irq_affinity_desc pointer or NULL if allocation failed. */ struct irq_affinity_desc * -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) +irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) { - int affvecs = nvecs - affd->pre_vectors - affd->post_vectors; - int curvec, usedvecs; - cpumask_var_t *node_to_cpumask; + unsigned int affvecs, curvec, usedvecs, i; struct irq_affinity_desc *masks = NULL; - int i, nr_sets; /* - * If there aren't any vectors left after applying the pre/post - * vectors don't bother with assigning affinity. + * Determine the number of vectors which need interrupt affinities + * assigned. If the pre/post request exhausts the available vectors + * then nothing to do here except for invoking the calc_sets() + * callback so the device driver can adjust to the situation. If there + * is only a single vector, then managing the queue is pointless as + * well. */ - if (nvecs == affd->pre_vectors + affd->post_vectors) + if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors) + affvecs = nvecs - affd->pre_vectors - affd->post_vectors; + else + affvecs = 0; + + /* + * Simple invocations do not provide a calc_sets() callback. Install + * the generic one. + */ + if (!affd->calc_sets) + affd->calc_sets = default_calc_sets; + + /* Recalculate the sets */ + affd->calc_sets(affd, affvecs); + + if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS)) return NULL; - node_to_cpumask = alloc_node_to_cpumask(); - if (!node_to_cpumask) + /* Nothing to assign? 
*/ + if (!affvecs) return NULL; masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL); if (!masks) - goto outnodemsk; + return NULL; /* Fill out vectors at the beginning that don't need affinity */ for (curvec = 0; curvec < affd->pre_vectors; curvec++) cpumask_copy(&masks[curvec].mask, irq_default_affinity); + /* * Spread on present CPUs starting from affd->pre_vectors. If we * have multiple sets, build each sets affinity mask separately. */ - nr_sets = affd->nr_sets; - if (!nr_sets) - nr_sets = 1; - - for (i = 0, usedvecs = 0; i < nr_sets; i++) { - int this_vecs = affd->sets ? affd->sets[i] : affvecs; + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; int ret; - ret = irq_build_affinity_masks(affd, curvec, this_vecs, - curvec, node_to_cpumask, masks); + ret = irq_build_affinity_masks(curvec, this_vecs, + curvec, masks); if (ret) { kfree(masks); - masks = NULL; - goto outnodemsk; + return NULL; } curvec += this_vecs; usedvecs += this_vecs; @@ -293,8 +315,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++) masks[i].is_managed = 1; -outnodemsk: - free_node_to_cpumask(node_to_cpumask); return masks; } @@ -304,25 +324,22 @@ outnodemsk: * @maxvec: The maximum number of vectors available * @affd: Description of the affinity requirements */ -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd) +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec, + const struct irq_affinity *affd) { - int resv = affd->pre_vectors + affd->post_vectors; - int vecs = maxvec - resv; - int set_vecs; + unsigned int resv = affd->pre_vectors + affd->post_vectors; + unsigned int set_vecs; if (resv > minvec) return 0; - if (affd->nr_sets) { - int i; - - for (i = 0, set_vecs = 0; i < affd->nr_sets; i++) - set_vecs += affd->sets[i]; + if (affd->calc_sets) { + set_vecs = maxvec - resv; } else { get_online_cpus(); set_vecs = cpumask_weight(cpu_possible_mask); put_online_cpus(); } - return resv + min(set_vecs, vecs); + return resv + min(set_vecs, maxvec - resv); } diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 16cbf6beb276..ae60cae24e9a 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -90,7 +90,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } else if (i < 32) mask |= 1 << i; @@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->istate &= ~IRQS_AUTODETECT; - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34e969069488..b76703b2c0af 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc) } irq_state_clr_started(desc); } +} + + +void irq_shutdown_and_deactivate(struct irq_desc *desc) +{ + irq_shutdown(desc); /* * This must be called even if the interrupt was never started up, * because the activation can happen before the interrupt is @@ -730,6 +736,39 @@ out: EXPORT_SYMBOL_GPL(handle_fasteoi_irq); /** + * handle_fasteoi_nmi - irq handler for NMI interrupt lines + * @desc: the interrupt description structure for this irq + * + * A simple NMI-safe handler, considering the restrictions + * from request_nmi. + * + * Only a single callback will be issued to the chip: an ->eoi() + * call when the interrupt has been serviced. This enables support + * for modern forms of interrupt handlers, which handle the flow + * details in hardware, transparently. + */ +void handle_fasteoi_nmi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + __kstat_incr_irqs_this_cpu(desc); + + trace_irq_handler_entry(irq, action); + /* + * NMIs cannot be shared, there is only one action. + */ + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} +EXPORT_SYMBOL_GPL(handle_fasteoi_nmi); + +/** * handle_edge_irq - edge type IRQ handler * @desc: the interrupt description structure for this irq * @@ -855,7 +894,11 @@ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -884,7 +927,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -908,6 +955,31 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } +/** + * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu + * dev ids + * @desc: the interrupt description structure for this irq + * + * Similar to handle_fasteoi_nmi, but handling the dev_id cookie + * as a percpu pointer. 
+ */ +void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + unsigned int irq = irq_desc_get_irq(desc); + irqreturn_t res; + + __kstat_incr_irqs_this_cpu(desc); + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} + static void __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, int is_chained, const char *name) @@ -1278,6 +1350,17 @@ void irq_chip_mask_parent(struct irq_data *data) EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** + * irq_chip_mask_ack_parent - Mask and acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask_ack(data); +} +EXPORT_SYMBOL_GPL(irq_chip_mask_ack_parent); + +/** * irq_chip_unmask_parent - Unmask the parent interrupt * @data: Pointer to interrupt specific data */ @@ -1376,11 +1459,43 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) { data = data->parent_data; + + if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) + return 0; + if (data->chip->irq_set_wake) return data->chip->irq_set_wake(data, on); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); + +/** + * irq_chip_request_resources_parent - Request resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +int irq_chip_request_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + + if (data->chip->irq_request_resources) + return data->chip->irq_request_resources(data); + + return -ENOSYS; +} +EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); + +/** + * irq_chip_release_resources_parent - Release resources on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_release_resources_parent(struct irq_data *data) +{ + data = data->parent_data; + if (data->chip->irq_release_resources) + data->chip->irq_release_resources(data); +} +EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); #endif /** diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 5b1072e394b2..6c7ca2e983a5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc) */ if (irqd_affinity_is_managed(d)) { irqd_set_managed_shutdown(d); - irq_shutdown(desc); + irq_shutdown_and_deactivate(desc); return false; } affinity = cpu_online_mask; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 6f636136cccc..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -56,6 +56,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), + BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI), }; static void @@ -140,6 +141,7 @@ static const struct irq_bit_descr irqdesc_istates[] = { BIT_MASK_DESCR(IRQS_WAITING), BIT_MASK_DESCR(IRQS_PENDING), BIT_MASK_DESCR(IRQS_SUSPENDED), + BIT_MASK_DESCR(IRQS_NMI), }; @@ -150,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p) raw_spin_lock_irq(&desc->lock); data = irq_desc_get_irq_data(desc); - seq_printf(m, "handler: %pf\n", desc->handle_irq); + seq_printf(m, "handler: %ps\n", 
desc->handle_irq); seq_printf(m, "device: %s\n", desc->dev_name); seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors); irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, @@ -203,8 +205,8 @@ static ssize_t irq_debug_write(struct file *file, const char __user *user_buf, chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_settings_is_level(desc)) { - /* Can't do level, sorry */ + if (irq_settings_is_level(desc) || desc->istate & IRQS_NMI) { + /* Can't do level nor NMIs, sorry */ err = -EINVAL; } else { desc->istate |= IRQS_PENDING; @@ -256,8 +258,6 @@ static int __init irq_debugfs_init(void) int irq; root_dir = debugfs_create_dir("irq", NULL); - if (!root_dir) - return -ENOMEM; irq_domain_debugfs_init(root_dir); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 5d5378ea0afe..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -84,8 +84,6 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs - * @thread_fn: function to be called in a threaded interrupt context. NULL - * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device, dev_name(dev) if NULL * @dev_id: A cookie passed back to the handler function @@ -222,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct, irq_flow_handler_t handler) { struct irq_chip_generic *gc; - unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); - gc = devm_kzalloc(dev, sz, GFP_KERNEL); + gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL); if (gc) irq_init_generic_chip(gc, name, num_ct, irq_base, reg_base, handler); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 38554bc35375..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); @@ -166,7 +166,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags __irq_wake_thread(desc, action); - /* Fall through to add to randomness */ + /* Fall through - to add to randomness */ case IRQ_HANDLED: *flags |= action->flags; break; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ca6afa267070..3924fbe829d4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum { * IRQS_WAITING - irq is waiting * IRQS_PENDING - irq is pending and replayed later * IRQS_SUSPENDED - irq is suspended + * IRQS_NMI - irq line is used to deliver NMIs */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,6 +61,7 @@ enum { IRQS_PENDING = 0x00000200, IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, + IRQS_NMI = 0x00002000, }; #include "debug.h" @@ -80,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); +extern void irq_shutdown_and_deactivate(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); extern void 
irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); @@ -94,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { } extern void irq_mark_irq(unsigned int irq); #endif +extern int __irq_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); @@ -242,12 +249,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc) #undef __irqd_to_state -static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +{ + __kstat_incr_irqs_this_cpu(desc); + desc->tot_count++; +} + static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); @@ -346,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp) return value & U16_MAX; } +static __always_inline void irq_timings_push(u64 ts, int irq) +{ + struct irq_timings *timings = this_cpu_ptr(&irq_timings); + + timings->values[timings->count & IRQ_TIMINGS_MASK] = + irq_timing_encode(ts, irq); + + timings->count++; +} + /* * The function record_irq_time is only called in one place in the * interrupts handler. We want this function always inline so the code @@ -359,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc) if (!static_branch_likely(&irq_timing_enabled)) return; - if (desc->istate & IRQS_TIMINGS) { - struct irq_timings *timings = this_cpu_ptr(&irq_timings); - - timings->values[timings->count & IRQ_TIMINGS_MASK] = - irq_timing_encode(local_clock(), - irq_desc_get_irq(desc)); - - timings->count++; - } + if (desc->istate & IRQS_TIMINGS) + irq_timings_push(local_clock(), irq_desc_get_irq(desc)); } #else static inline void irq_remove_timings(struct irq_desc *desc) {} diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 98a20e1594ce..b992f88c5613 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -25,10 +25,22 @@ static void irq_sim_irqunmask(struct irq_data *data) irq_ctx->enabled = true; } +static int irq_sim_set_type(struct irq_data *data, unsigned int type) +{ + /* We only support rising and falling edge trigger types. 
*/ + if (type & ~IRQ_TYPE_EDGE_BOTH) + return -EINVAL; + + irqd_set_trigger_type(data, type); + + return 0; +} + static struct irq_chip irq_sim_irqchip = { .name = "irq_sim", .irq_mask = irq_sim_irqmask, .irq_unmask = irq_sim_irqunmask, + .irq_set_type = irq_sim_set_type, }; static void irq_sim_handle_irq(struct irq_work *work) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index ef8ad36cadcf..9484e88dabc2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; + desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) @@ -274,11 +275,12 @@ static struct attribute *irq_attrs[] = { &actions_attr.attr, NULL }; +ATTRIBUTE_GROUPS(irq); static struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = irq_attrs, + .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) @@ -557,6 +559,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + mutex_init(&desc[i].request_mutex); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -669,6 +672,45 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, set_irq_regs(old_regs); return ret; } + +#ifdef CONFIG_IRQ_DOMAIN +/** + * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * @regs: Register file coming from the low-level handling code + * + * This function must be called from an NMI context. + * + * Returns: 0 on success, or -EINVAL if conversion has failed + */ +int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, + struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + unsigned int irq; + int ret = 0; + + /* + * NMI context needs to be setup earlier in order to deal with tracing. + */ + WARN_ON(!in_nmi()); + + irq = irq_find_mapping(domain, hwirq); + + /* + * ack_bad_irq is not NMI-safe, just report + * an invalid interrupt. 
+ */ + if (likely(irq)) + generic_handle_irq(irq); + else + ret = -EINVAL; + + set_irq_regs(old_regs); + return ret; +} +#endif #endif /* Dynamic interrupt handling */ @@ -908,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } +static bool irq_is_nmi(struct irq_desc *desc) +{ + return desc->istate & IRQS_NMI; +} + /** * kstat_irqs - Get the statistics for an interrupt * @irq: The interrupt number @@ -919,11 +966,16 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - int cpu; unsigned int sum = 0; + int cpu; if (!desc || !desc->kstat_irqs) return 0; + if (!irq_settings_is_per_cpu_devid(desc) && + !irq_settings_is_per_cpu(desc) && + !irq_is_nmi(desc)) + return desc->tot_count; + for_each_possible_cpu(cpu) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8b0be4bd6565..3078d0e48bba 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * @ops: domain callbacks * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. + * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, @@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), GFP_KERNEL, of_node_to_nid(of_node)); - if (WARN_ON(!domain)) + if (!domain) return NULL; if (fwnode && is_fwnode_irqchip(fwnode)) { @@ -458,6 +458,20 @@ void irq_set_default_host(struct irq_domain *domain) } EXPORT_SYMBOL_GPL(irq_set_default_host); +/** + * irq_get_default_host() - Retrieve the "default" irq domain + * + * Returns: the default domain, if any. + * + * Modern code should never use this. This should only be used on + * systems that cannot implement a firmware->fwnode mapping (which + * both DT and ACPI provide). + */ +struct irq_domain *irq_get_default_host(void) +{ + return irq_default_domain; +} + static void irq_domain_clear_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { @@ -729,16 +743,17 @@ static int irq_domain_translate(struct irq_domain *d, return 0; } -static void of_phandle_args_to_fwspec(struct of_phandle_args *irq_data, +static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, + unsigned int count, struct irq_fwspec *fwspec) { int i; - fwspec->fwnode = irq_data->np ? &irq_data->np->fwnode : NULL; - fwspec->param_count = irq_data->args_count; + fwspec->fwnode = np ? 
&np->fwnode : NULL; + fwspec->param_count = count; - for (i = 0; i < irq_data->args_count; i++) - fwspec->param[i] = irq_data->args[i]; + for (i = 0; i < count; i++) + fwspec->param[i] = args[i]; } unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) @@ -836,7 +851,9 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) { struct irq_fwspec fwspec; - of_phandle_args_to_fwspec(irq_data, &fwspec); + of_phandle_args_to_fwspec(irq_data->np, irq_data->args, + irq_data->args_count, &fwspec); + return irq_create_fwspec_mapping(&fwspec); } EXPORT_SYMBOL_GPL(irq_create_of_mapping); @@ -928,11 +945,10 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type) { - if (WARN_ON(intsize < 2)) - return -EINVAL; - *out_hwirq = intspec[0]; - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; - return 0; + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + return irq_domain_translate_twocell(d, &fwspec, out_hwirq, out_type); } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); @@ -968,6 +984,27 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); +/** + * irq_domain_translate_twocell() - Generic translate for direct two cell + * bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. + */ +int irq_domain_translate_twocell(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *out_hwirq, + unsigned int *out_type) +{ + if (WARN_ON(fwspec->param_count < 2)) + return -EINVAL; + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { @@ -1260,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, /** * __irq_domain_alloc_irqs - Allocate IRQs from domain * @domain: domain to allocate from - * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @irq_base: allocate specified IRQ number if irq_base >= 0 * @nr_irqs: number of IRQs to allocate * @node: NUMA node id for memory allocation * @arg: domain specific argument @@ -1749,8 +1786,6 @@ void __init irq_domain_debugfs_init(struct dentry *root) struct irq_domain *d; domain_dir = debugfs_create_dir("domains", root); - if (!domain_dir) - return; debugfs_create_file("default", 0444, domain_dir, NULL, &irq_domain_debug_fops); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84b54a17b95d..e8f7f179bf77 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> +#include <linux/irqdomain.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/rt.h> @@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -static void __synchronize_hardirq(struct irq_desc *desc) +static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) { + struct irq_data *irqd = irq_desc_get_irq_data(desc); bool inprogress; do { @@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) /* Ok, that indicated we're done: double-check carefully. 
*/ raw_spin_lock_irqsave(&desc->lock, flags); inprogress = irqd_irq_inprogress(&desc->irq_data); + + /* + * If requested and supported, check at the chip whether it + * is in flight at the hardware level, i.e. already pending + * in a CPU and waiting for service and acknowledge. + */ + if (!inprogress && sync_chip) { + /* + * Ignore the return code. inprogress is only updated + * when the chip supports it. + */ + __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, + &inprogress); + } raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ @@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc) * Returns: false if a threaded handler is active. * * This function may be called - with care - from IRQ context. + * + * It does not check whether there is an interrupt in flight at the + * hardware level, but not serviced yet, as this might deadlock when + * called with interrupts disabled and the target CPU of the interrupt + * is the current CPU. */ bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { - __synchronize_hardirq(desc); + __synchronize_hardirq(desc, false); return !atomic_read(&desc->threads_active); } @@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq); * to complete before returning. If you use this function while * holding a resource the IRQ handler may need you will deadlock. * - * This function may be called - with care - from IRQ context. + * Can only be called from preemptible code as it might sleep when + * an interrupt thread is associated to @irq. + * + * It optionally makes sure (when the irq chip supports that method) + * that the interrupt is not pending in any CPU and waiting for + * service. */ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { - __synchronize_hardirq(desc); + __synchronize_hardirq(desc, true); /* * We made sure that no hardirq handler is * running. Now verify that no threaded handlers are @@ -196,6 +222,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, case IRQ_SET_MASK_OK: case IRQ_SET_MASK_OK_DONE: cpumask_copy(desc->irq_common_data.affinity, mask); + /* fall through */ case IRQ_SET_MASK_OK_NOCOPY: irq_validate_effective_affinity(data); irq_set_thread_affinity(desc); @@ -341,7 +368,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) /* The release function is promised process context */ might_sleep(); - if (!desc) + if (!desc || desc->istate & IRQS_NMI) return -EINVAL; /* Complete initialisation of *notify */ @@ -356,8 +383,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) desc->affinity_notify = notify; raw_spin_unlock_irqrestore(&desc->lock, flags); - if (old_notify) + if (old_notify) { + cancel_work_sync(&old_notify->work); kref_put(&old_notify->kref, old_notify->release); + } return 0; } @@ -553,6 +582,21 @@ bool disable_hardirq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_hardirq); +/** + * disable_nmi_nosync - disable an nmi without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and enables are + * nested. + * The interrupt to disable must have been requested through request_nmi. + * Unlike disable_nmi(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. 
+ */ +void disable_nmi_nosync(unsigned int irq) +{ + disable_irq_nosync(irq); +} + void __enable_irq(struct irq_desc *desc) { switch (desc->depth) { @@ -609,6 +653,20 @@ out: } EXPORT_SYMBOL(enable_irq); +/** + * enable_nmi - enable handling of an nmi + * @irq: Interrupt to enable + * + * The interrupt to enable must have been requested through request_nmi. + * Undoes the effect of one call to disable_nmi(). If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + */ +void enable_nmi(unsigned int irq) +{ + enable_irq(irq); +} + static int set_irq_wake_real(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); @@ -644,6 +702,12 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (!desc) return -EINVAL; + /* Don't use NMIs as wake up interrupts please */ + if (desc->istate & IRQS_NMI) { + ret = -EINVAL; + goto out_unlock; + } + /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ @@ -666,6 +730,8 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } + +out_unlock: irq_put_desc_busunlock(desc, flags); return ret; } @@ -726,6 +792,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); + /* fall through */ case IRQ_SET_MASK_OK_NOCOPY: flags = irqd_get_trigger_type(&desc->irq_data); @@ -740,7 +807,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags) ret = 0; break; default: - pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", + pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n", flags, irq_desc_get_irq(desc), chip->irq_set_type); } if (unmask) @@ -1128,6 +1195,39 @@ static void irq_release_resources(struct irq_desc *desc) c->irq_release_resources(d); } +static bool irq_supports_nmi(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* Only IRQs directly managed by the root irqchip can be set as NMI */ + if (d->parent_data) + return false; +#endif + /* Don't support NMIs for chips behind a slow bus */ + if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock) + return false; + + return d->chip->flags & IRQCHIP_SUPPORTS_NMI; +} + +static int irq_nmi_setup(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *c = d->chip; + + return c->irq_nmi_setup ? c->irq_nmi_setup(d) : -EINVAL; +} + +static void irq_nmi_teardown(struct irq_desc *desc) +{ + struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_chip *c = d->chip; + + if (c->irq_nmi_teardown) + c->irq_nmi_teardown(d); +} + static int setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) { @@ -1302,9 +1402,17 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * fields must have IRQF_SHARED set and the bits which * set the trigger type must match. Also all must * agree on ONESHOT. + * Interrupt lines used for NMIs cannot be shared. */ unsigned int oldtype; + if (desc->istate & IRQS_NMI) { + pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n", + new->name, irq, desc->irq_data.chip->name); + ret = -EINVAL; + goto out_unlock; + } + /* * If nobody did set the configuration before, inherit * the one provided by the requester. 
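The NMI plumbing added above (irq_supports_nmi(), irq_nmi_setup(), irq_nmi_teardown()) only accepts interrupt lines whose root irqchip opts in. Below is a hedged sketch of that chip-side contract; every name prefixed example_ is hypothetical, and only the IRQCHIP_SUPPORTS_NMI flag and the irq_nmi_setup/irq_nmi_teardown hooks come from the code above:

#include <linux/irq.h>

static void example_mask(struct irq_data *d)
{
        /* Mask the line in hardware; must be safe from NMI context. */
}

static void example_unmask(struct irq_data *d)
{
        /* Unmask the line in hardware. */
}

static void example_eoi(struct irq_data *d)
{
        /* Acknowledge/EOI the line, as handle_fasteoi_nmi() expects. */
}

static int example_nmi_setup(struct irq_data *d)
{
        /* Reroute this line to the CPU's NMI entry point. */
        return 0;
}

static void example_nmi_teardown(struct irq_data *d)
{
        /* Undo example_nmi_setup() and fall back to normal IRQ delivery. */
}

static struct irq_chip example_nmi_chip = {
        .name                   = "example-nmi",
        .irq_mask               = example_mask,
        .irq_unmask             = example_unmask,
        .irq_eoi                = example_eoi,
        .irq_nmi_setup          = example_nmi_setup,
        .irq_nmi_teardown       = example_nmi_teardown,
        .flags                  = IRQCHIP_SUPPORTS_NMI,
};

Note that irq_supports_nmi() also rejects chips sitting behind a hierarchy parent or a slow bus (irq_bus_lock/irq_bus_sync_unlock), so a chip like this has to be the root of the domain and register-accessible from NMI context; a line it serves can then be handed to request_nmi() or request_percpu_nmi() further down.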
@@ -1617,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { irq_settings_clr_disable_unlazy(desc); + /* Only shutdown. Deactivate after synchronize_hardirq() */ irq_shutdown(desc); } @@ -1645,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) unregister_handler_proc(irq, action); - /* Make sure it's not being used on another CPU: */ - synchronize_hardirq(irq); + /* + * Make sure it's not being used on another CPU and if the chip + * supports it also make sure that there is no (not yet serviced) + * interrupt in flight at the hardware level. + */ + __synchronize_hardirq(desc, true); #ifdef CONFIG_DEBUG_SHIRQ /* @@ -1686,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * require it to deallocate resources over the slow bus. */ chip_bus_lock(desc); + /* + * There is no interrupt on the fly anymore. Deactivate it + * completely. + */ + raw_spin_lock_irqsave(&desc->lock, flags); + irq_domain_deactivate_irq(&desc->irq_data); + raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_release_resources(desc); chip_bus_sync_unlock(desc); irq_remove_timings(desc); @@ -1756,6 +1877,59 @@ const void *free_irq(unsigned int irq, void *dev_id) } EXPORT_SYMBOL(free_irq); +/* This function must be called with desc->lock held */ +static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) +{ + const char *devname = NULL; + + desc->istate &= ~IRQS_NMI; + + if (!WARN_ON(desc->action == NULL)) { + irq_pm_remove_action(desc, desc->action); + devname = desc->action->name; + unregister_handler_proc(irq, desc->action); + + kfree(desc->action); + desc->action = NULL; + } + + irq_settings_clr_disable_unlazy(desc); + irq_shutdown_and_deactivate(desc); + + irq_release_resources(desc); + + irq_chip_pm_put(&desc->irq_data); + module_put(desc->owner); + + return devname; +} + +const void *free_nmi(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + const void *devname; + + if (!desc || WARN_ON(!(desc->istate & IRQS_NMI))) + return NULL; + + if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return NULL; + + /* NMI still enabled */ + if (WARN_ON(desc->depth == 0)) + disable_nmi_nosync(irq); + + raw_spin_lock_irqsave(&desc->lock, flags); + + irq_nmi_teardown(desc); + devname = __cleanup_nmi(irq, desc); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return devname; +} + /** * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate @@ -1925,6 +2099,101 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, } EXPORT_SYMBOL_GPL(request_any_context_irq); +/** + * request_nmi - allocate an interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @irqflags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It sets up the IRQ line + * to be handled as an NMI. + * + * An interrupt line delivering NMIs cannot be shared and IRQ handling + * cannot be threaded. + * + * Interrupt lines requested for NMI delivering must produce per cpu + * interrupts and have auto enabling setting disabled. + * + * Dev_id must be globally unique. 
Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail and return a negative value. + */ +int request_nmi(unsigned int irq, irq_handler_t handler, + unsigned long irqflags, const char *name, void *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + unsigned long flags; + int retval; + + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + /* NMI cannot be shared, used for Polling */ + if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL)) + return -EINVAL; + + if (!(irqflags & IRQF_PERCPU)) + return -EINVAL; + + if (!handler) + return -EINVAL; + + desc = irq_to_desc(irq); + + if (!desc || irq_settings_can_autoenable(desc) || + !irq_settings_can_request(desc) || + WARN_ON(irq_settings_is_per_cpu_devid(desc)) || + !irq_supports_nmi(desc)) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING; + action->name = name; + action->dev_id = dev_id; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + goto err_out; + + retval = __setup_irq(irq, desc, action); + if (retval) + goto err_irq_setup; + + raw_spin_lock_irqsave(&desc->lock, flags); + + /* Setup NMI state */ + desc->istate |= IRQS_NMI; + retval = irq_nmi_setup(desc); + if (retval) { + __cleanup_nmi(irq, desc); + raw_spin_unlock_irqrestore(&desc->lock, flags); + return -EINVAL; + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; + +err_irq_setup: + irq_chip_pm_put(&desc->irq_data); +err_out: + kfree(action); + + return retval; +} + void enable_percpu_irq(unsigned int irq, unsigned int type) { unsigned int cpu = smp_processor_id(); @@ -1959,6 +2228,11 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +void enable_percpu_nmi(unsigned int irq, unsigned int type) +{ + enable_percpu_irq(irq, type); +} + /** * irq_percpu_is_enabled - Check whether the per cpu irq is enabled * @irq: Linux irq number to check for @@ -1998,6 +2272,11 @@ void disable_percpu_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(disable_percpu_irq); +void disable_percpu_nmi(unsigned int irq) +{ + disable_percpu_irq(irq); +} + /* * Internal function to unregister a percpu irqaction. */ @@ -2029,6 +2308,8 @@ static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_ /* Found it - now remove it from the list of entries: */ desc->action = NULL; + desc->istate &= ~IRQS_NMI; + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -2082,6 +2363,19 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id) } EXPORT_SYMBOL_GPL(free_percpu_irq); +void free_percpu_nmi(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return; + + if (WARN_ON(!(desc->istate & IRQS_NMI))) + return; + + kfree(__free_percpu_irq(irq, dev_id)); +} + /** * setup_percpu_irq - setup a per-cpu interrupt * @irq: Interrupt line to setup @@ -2172,6 +2466,180 @@ int __request_percpu_irq(unsigned int irq, irq_handler_t handler, EXPORT_SYMBOL_GPL(__request_percpu_irq); /** + * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. 
+ * @name: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs + * have to be setup on each CPU by calling prepare_percpu_nmi() before + * being enabled on the same CPU by using enable_percpu_nmi(). + * + * Dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. + * + * Interrupt lines requested for NMI delivering should have auto enabling + * setting disabled. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. + */ +int request_percpu_nmi(unsigned int irq, irq_handler_t handler, + const char *name, void __percpu *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + unsigned long flags; + int retval; + + if (!handler) + return -EINVAL; + + desc = irq_to_desc(irq); + + if (!desc || !irq_settings_can_request(desc) || + !irq_settings_is_per_cpu_devid(desc) || + irq_settings_can_autoenable(desc) || + !irq_supports_nmi(desc)) + return -EINVAL; + + /* The line cannot already be NMI */ + if (desc->istate & IRQS_NMI) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD + | IRQF_NOBALANCING; + action->name = name; + action->percpu_dev_id = dev_id; + + retval = irq_chip_pm_get(&desc->irq_data); + if (retval < 0) + goto err_out; + + retval = __setup_irq(irq, desc, action); + if (retval) + goto err_irq_setup; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->istate |= IRQS_NMI; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; + +err_irq_setup: + irq_chip_pm_put(&desc->irq_data); +err_out: + kfree(action); + + return retval; +} + +/** + * prepare_percpu_nmi - performs CPU local setup for NMI delivery + * @irq: Interrupt line to prepare for NMI delivery + * + * This call prepares an interrupt line to deliver NMI on the current CPU, + * before that interrupt line gets enabled with enable_percpu_nmi(). + * + * As a CPU local operation, this should be called from non-preemptible + * context. + * + * If the interrupt line cannot be used to deliver NMIs, function + * will fail returning a negative value. + */ +int prepare_percpu_nmi(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc; + int ret = 0; + + WARN_ON(preemptible()); + + desc = irq_get_desc_lock(irq, &flags, + IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return -EINVAL; + + if (WARN(!(desc->istate & IRQS_NMI), + KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n", + irq)) { + ret = -EINVAL; + goto out; + } + + ret = irq_nmi_setup(desc); + if (ret) { + pr_err("Failed to setup NMI delivery: irq %u\n", irq); + goto out; + } + +out: + irq_put_desc_unlock(desc, flags); + return ret; +} + +/** + * teardown_percpu_nmi - undoes NMI setup of IRQ line + * @irq: Interrupt line from which CPU local NMI configuration should be + * removed + * + * This call undoes the setup done by prepare_percpu_nmi(). + * + * IRQ line should not be enabled for the current CPU. + * + * As a CPU local operation, this should be called from non-preemptible + * context. 
+ */ +void teardown_percpu_nmi(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc; + + WARN_ON(preemptible()); + + desc = irq_get_desc_lock(irq, &flags, + IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return; + + if (WARN_ON(!(desc->istate & IRQS_NMI))) + goto out; + + irq_nmi_teardown(desc); +out: + irq_put_desc_unlock(desc, flags); +} + +int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, + bool *state) +{ + struct irq_chip *chip; + int err = -EINVAL; + + do { + chip = irq_data_get_irq_chip(data); + if (chip->irq_get_irqchip_state) + break; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + data = data->parent_data; +#else + data = NULL; +#endif + } while (data); + + if (data) + err = chip->irq_get_irqchip_state(data, which, state); + return err; +} + +/** * irq_get_irqchip_state - returns the irqchip state of a interrupt. * @irq: Interrupt line that is forwarded to a VM * @which: One of IRQCHIP_STATE_* the caller wants to know about @@ -2189,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, { struct irq_desc *desc; struct irq_data *data; - struct irq_chip *chip; unsigned long flags; int err = -EINVAL; @@ -2199,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, data = irq_desc_get_irq_data(desc); - do { - chip = irq_data_get_irq_chip(data); - if (chip->irq_get_irqchip_state) - break; -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - data = data->parent_data; -#else - data = NULL; -#endif - } while (data); - - if (data) - err = chip->irq_get_irqchip_state(data, which, state); + err = __irq_get_irqchip_state(data, which, state); irq_put_desc_busunlock(desc, flags); return err; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) */ raw_spin_lock_irqsave(&desc->lock, flags); for_each_action_of_desc(desc, action) { - printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler); if (action->thread_fn) - printk(KERN_CONT " threaded [<%p>] %pf", + printk(KERN_CONT " threaded [<%p>] %ps", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); } diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..e960d7ce7bcc 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -1,14 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> +#define pr_fmt(fmt) "irq_timings: " fmt #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/static_key.h> +#include <linux/init.h> #include <linux/interrupt.h> #include <linux/idr.h> #include <linux/irq.h> #include <linux/math64.h> +#include <linux/log2.h> #include <trace/events/irq.h> @@ -18,16 +21,6 @@ DEFINE_STATIC_KEY_FALSE(irq_timing_enabled); DEFINE_PER_CPU(struct irq_timings, irq_timings); -struct irqt_stat { - u64 next_evt; - u64 last_ts; - u64 variance; - u32 avg; - u32 nr_samples; - int anomalies; - int valid; -}; - static DEFINE_IDR(irqt_stats); void irq_timings_enable(void) @@ -40,182 +33,473 @@ void irq_timings_disable(void) static_branch_disable(&irq_timing_enabled); } -/** - * irqs_update - update the irq timing statistics with a new timestamp +/* + * The main goal of this algorithm is to predict the next interrupt + * occurrence on the current CPU. 
+ * + * Currently, the interrupt timings are stored in a circular array + * buffer every time there is an interrupt, as a tuple: the interrupt + * number and the associated timestamp when the event occurred <irq, + * timestamp>. + * + * For every interrupt occurring in a short period of time, we can + * measure the elapsed time between the occurrences for the same + * interrupt and we end up with a suite of intervals. The experience + * showed the interrupts are often coming following a periodic + * pattern. + * + * The objective of the algorithm is to find out this periodic pattern + * in a fastest way and use its period to predict the next irq event. + * + * When the next interrupt event is requested, we are in the situation + * where the interrupts are disabled and the circular buffer + * containing the timings is filled with the events which happened + * after the previous next-interrupt-event request. + * + * At this point, we read the circular buffer and we fill the irq + * related statistics structure. After this step, the circular array + * containing the timings is empty because all the values are + * dispatched in their corresponding buffers. + * + * Now for each interrupt, we can predict the next event by using the + * suffix array, log interval and exponential moving average + * + * 1. Suffix array + * + * Suffix array is an array of all the suffixes of a string. It is + * widely used as a data structure for compression, text search, ... + * For instance for the word 'banana', the suffixes will be: 'banana' + * 'anana' 'nana' 'ana' 'na' 'a' + * + * Usually, the suffix array is sorted but for our purpose it is + * not necessary and won't provide any improvement in the context of + * the solved problem where we clearly define the boundaries of the + * search by a max period and min period. + * + * The suffix array will build a suite of intervals of different + * length and will look for the repetition of each suite. If the suite + * is repeating then we have the period because it is the length of + * the suite whatever its position in the buffer. + * + * 2. Log interval + * + * We saw the irq timings allow to compute the interval of the + * occurrences for a specific interrupt. We can reasonibly assume the + * longer is the interval, the higher is the error for the next event + * and we can consider storing those interval values into an array + * where each slot in the array correspond to an interval at the power + * of 2 of the index. For example, index 12 will contain values + * between 2^11 and 2^12. + * + * At the end we have an array of values where at each index defines a + * [2^index - 1, 2 ^ index] interval values allowing to store a large + * number of values inside a small array. + * + * For example, if we have the value 1123, then we store it at + * ilog2(1123) = 10 index value. + * + * Storing those value at the specific index is done by computing an + * exponential moving average for this specific slot. For instance, + * for values 1800, 1123, 1453, ... fall under the same slot (10) and + * the exponential moving average is computed every time a new value + * is stored at this slot. + * + * 3. Exponential Moving Average + * + * The EMA is largely used to track a signal for stocks or as a low + * pass filter. The magic of the formula, is it is very simple and the + * reactivity of the average can be tuned with the factors called + * alpha. + * + * The higher the alphas are, the faster the average respond to the + * signal change. 
In our case, if a slot in the array is a big + * interval, we can have numbers with a big difference between + * them. The impact of those differences in the average computation + * can be tuned by changing the alpha value. + * + * + * -- The algorithm -- + * + * We saw the different processing above, now let's see how they are + * used together. + * + * For each interrupt: + * For each interval: + * Compute the index = ilog2(interval) + * Compute a new_ema(buffer[index], interval) + * Store the index in a circular buffer + * + * Compute the suffix array of the indexes + * + * For each suffix: + * If the suffix is reverse-found 3 times + * Return suffix + * + * Return Not found + * + * However we can not have endless suffix array to be build, it won't + * make sense and it will add an extra overhead, so we can restrict + * this to a maximum suffix length of 5 and a minimum suffix length of + * 2. The experience showed 5 is the majority of the maximum pattern + * period found for different devices. + * + * The result is a pattern finding less than 1us for an interrupt. + * + * Example based on real values: + * + * Example 1 : MMC write/read interrupt interval: + * + * 223947, 1240, 1384, 1386, 1386, + * 217416, 1236, 1384, 1386, 1387, + * 214719, 1241, 1386, 1387, 1384, + * 213696, 1234, 1384, 1386, 1388, + * 219904, 1240, 1385, 1389, 1385, + * 212240, 1240, 1386, 1386, 1386, + * 214415, 1236, 1384, 1386, 1387, + * 214276, 1234, 1384, 1388, ? + * + * For each element, apply ilog2(value) * - * @irqs: an irqt_stat struct pointer - * @ts: the new timestamp + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? * - * The statistics are computed online, in other words, the code is - * designed to compute the statistics on a stream of values rather - * than doing multiple passes on the values to compute the average, - * then the variance. The integer division introduces a loss of - * precision but with an acceptable error margin regarding the results - * we would have with the double floating precision: we are dealing - * with nanosec, so big numbers, consequently the mantisse is - * negligeable, especially when converting the time in usec - * afterwards. + * Max period of 5, we take the last (max_period * 3) 15 elements as + * we can be confident if the pattern repeats itself three times it is + * a repeating pattern. * - * The computation happens at idle time. When the CPU is not idle, the - * interrupts' timestamps are stored in the circular buffer, when the - * CPU goes idle and this routine is called, all the buffer's values - * are injected in the statistical model continuying to extend the - * statistics from the previous busy-idle cycle. + * 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, 8, + * 15, 8, 8, 8, ? * - * The observations showed a device will trigger a burst of periodic - * interrupts followed by one or two peaks of longer time, for - * instance when a SD card device flushes its cache, then the periodic - * intervals occur again. A one second inactivity period resets the - * stats, that gives us the certitude the statistical values won't - * exceed 1x10^9, thus the computation won't overflow. + * Suffixes are: * - * Basically, the purpose of the algorithm is to watch the periodic - * interrupts and eliminate the peaks. 
+ * 1) 8, 15, 8, 8, 8 <- max period + * 2) 8, 15, 8, 8 + * 3) 8, 15, 8 + * 4) 8, 15 <- min period * - * An interrupt is considered periodically stable if the interval of - * its occurences follow the normal distribution, thus the values - * comply with: + * From there we search the repeating pattern for each suffix. * - * avg - 3 x stddev < value < avg + 3 x stddev + * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 + * | | | | | | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | | | | | | + * 8, 15, 8, 8, 8 | | | | | + * 8, 15, 8, 8, 8 * - * Which can be simplified to: + * When moving the suffix, we found exactly 3 matches. * - * -3 x stddev < value - avg < 3 x stddev + * The first suffix with period 5 is repeating. * - * abs(value - avg) < 3 x stddev + * The next event is (3 * max_period) % suffix_period * - * In order to save a costly square root computation, we use the - * variance. For the record, stddev = sqrt(variance). The equation - * above becomes: + * In this example, the result 0, so the next event is suffix[0] => 8 * - * abs(value - avg) < 3 x sqrt(variance) + * However, 8 is the index in the array of exponential moving average + * which was calculated on the fly when storing the values, so the + * interval is ema[8] = 1366 * - * And finally we square it: * - * (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 + * Example 2: * - * (value - avg) x (value - avg) < 9 x variance + * 4, 3, 5, 100, + * 3, 3, 5, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 5, 3, 117, + * 4, 4, 5, 112, + * 4, 3, 4, 110, + * 3, 4, 5, 112, + * 4, 3, 4, 110 * - * Statistically speaking, any values out of this interval is - * considered as an anomaly and is discarded. However, a normal - * distribution appears when the number of samples is 30 (it is the - * rule of thumb in statistics, cf. "30 samples" on Internet). When - * there are three consecutive anomalies, the statistics are resetted. + * ilog2 * + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Max period 5: + * 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4, + * 0, 0, 0, 4 + * + * Suffixes: + * + * 1) 0, 0, 4, 0, 0 + * 2) 0, 0, 4, 0 + * 3) 0, 0, 4 + * 4) 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | X + * 0, 0, 4, 0, 0, | X + * 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + * | | | | | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | | | | | + * 0, 0, 4, 0, | | | | | | | + * 0, 0, 4, 0, | | | + * 0 0 4 + * + * Pattern is found 3 times, the remaining is 1 which results from + * (max_period * 3) % suffix_period. This value is the index in the + * suffix arrays. The suffix array for a period 4 has the value 4 + * at index 1. + */ +#define EMA_ALPHA_VAL 64 +#define EMA_ALPHA_SHIFT 7 + +#define PREDICTION_PERIOD_MIN 3 +#define PREDICTION_PERIOD_MAX 5 +#define PREDICTION_FACTOR 4 +#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ +#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ + +/* + * Number of elements in the circular buffer: If it happens it was + * flushed before, then the number of elements could be smaller than + * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is + * used as we wrapped. The index begins from zero when we did not + * wrap. That could be done in a nicer way with the proper circular + * array structure type but with the cost of extra computation in the + * interrupt handler hot path. We choose efficiency. 
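The for_each_irqts() macro defined just below implements exactly this walk. For readers parsing the dense macro, here is the same oldest-first drain of a power-of-two ring with a monotonically growing write counter, written as plain userspace C with stand-in names:

#include <stdio.h>

#define RING_SIZE 32			/* power of two, like IRQ_TIMINGS_SIZE */
#define RING_MASK (RING_SIZE - 1)

struct ring {
	unsigned int count;		/* total pushes, never reset on wrap */
	int values[RING_SIZE];
};

static void push(struct ring *r, int v)
{
	r->values[r->count & RING_MASK] = v;
	r->count++;
}

/* Walk from oldest to newest and consume everything, as for_each_irqts() does. */
static void drain(struct ring *r)
{
	unsigned int nr = r->count < RING_SIZE ? r->count : RING_SIZE;
	unsigned int i  = r->count < RING_SIZE ? 0 : r->count & RING_MASK;

	for (; nr; nr--, i = (i + 1) & RING_MASK)
		printf("%d ", r->values[i]);
	printf("\n");
	r->count = 0;			/* buffer is empty after the walk */
}

int main(void)
{
	struct ring r = { 0 };
	int v;

	for (v = 0; v < 40; v++)	/* wrap once: only the last 32 remain */
		push(&r, v);
	drain(&r);			/* prints 8 .. 39, oldest first */
	return 0;
}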
+ */ +#define for_each_irqts(i, irqts) \ + for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ + 0 : irqts->count & IRQ_TIMINGS_MASK, \ + irqts->count = min(IRQ_TIMINGS_SIZE, \ + irqts->count); \ + irqts->count > 0; irqts->count--, \ + i = (i + 1) & IRQ_TIMINGS_MASK) + +struct irqt_stat { + u64 last_ts; + u64 ema_time[PREDICTION_BUFFER_SIZE]; + int timings[IRQ_TIMINGS_SIZE]; + int circ_timings[IRQ_TIMINGS_SIZE]; + int count; +}; + +/* + * Exponential moving average computation */ -static void irqs_update(struct irqt_stat *irqs, u64 ts) +static u64 irq_timings_ema_new(u64 value, u64 ema_old) { - u64 old_ts = irqs->last_ts; - u64 variance = 0; - u64 interval; s64 diff; + if (unlikely(!ema_old)) + return value; + + diff = (value - ema_old) * EMA_ALPHA_VAL; /* - * The timestamps are absolute time values, we need to compute - * the timing interval between two interrupts. + * We can use a s64 type variable to be added with the u64 + * ema_old variable as this one will never have its topmost + * bit set, it will be always smaller than 2^63 nanosec + * interrupt interval (292 years). */ - irqs->last_ts = ts; + return ema_old + (diff >> EMA_ALPHA_SHIFT); +} + +static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) +{ + int period; /* - * The interval type is u64 in order to deal with the same - * type in our computation, that prevent mindfuck issues with - * overflow, sign and division. + * Move the beginning pointer to the end minus the max period x 3. + * We are at the point we can begin searching the pattern */ - interval = ts - old_ts; + buffer = &buffer[len - (period_max * 3)]; + + /* Adjust the length to the maximum allowed period x 3 */ + len = period_max * 3; /* - * The interrupt triggered more than one second apart, that - * ends the sequence as predictible for our purpose. In this - * case, assume we have the beginning of a sequence and the - * timestamp is the first value. As it is impossible to - * predict anything at this point, return. - * - * Note the first timestamp of the sequence will always fall - * in this test because the old_ts is zero. That is what we - * want as we need another timestamp to compute an interval. + * The buffer contains the suite of intervals, in a ilog2 + * basis, we are looking for a repetition. We point the + * beginning of the search three times the length of the + * period beginning at the end of the buffer. We do that for + * each suffix. */ - if (interval >= NSEC_PER_SEC) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; + for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { + + /* + * The first comparison always succeed because the + * suffix is deduced from the first n-period bytes of + * the buffer and we compare the initial suffix with + * itself, so we can skip the first iteration. + */ + int idx = period; + size_t size = period; + + /* + * We look if the suite with period 'i' repeat + * itself. If it is truncated at the end, as it + * repeats we can use the period to find out the next + * element with the modulo. + */ + while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { + + /* + * Move the index in a period basis + */ + idx += size; + + /* + * If this condition is reached, all previous + * memcmp were successful, so the period is + * found. + */ + if (idx == len) + return buffer[len % period]; + + /* + * If the remaining elements to compare are + * smaller than the period, readjust the size + * of the comparison for the last iteration. 
+ */ + if (len - idx < period) + size = len - idx; + } + } + + return -1; +} + +static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) +{ + int index, i, period_max, count, start, min = INT_MAX; + + if ((now - irqs->last_ts) >= NSEC_PER_SEC) { + irqs->count = irqs->last_ts = 0; + return U64_MAX; } /* - * Pre-compute the delta with the average as the result is - * used several times in this function. + * As we want to find three times the repetition, we need a + * number of intervals greater or equal to three times the + * maximum period, otherwise we truncate the max period. */ - diff = interval - irqs->avg; + period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : irqs->count / 3; /* - * Increment the number of samples. + * If we don't have enough irq timings for this prediction, + * just bail out. */ - irqs->nr_samples++; + if (period_max <= PREDICTION_PERIOD_MIN) + return U64_MAX; /* - * Online variance divided by the number of elements if there - * is more than one sample. Normally the formula is division - * by nr_samples - 1 but we assume the number of element will be - * more than 32 and dividing by 32 instead of 31 is enough - * precise. + * 'count' will depends if the circular buffer wrapped or not */ - if (likely(irqs->nr_samples > 1)) - variance = irqs->variance >> IRQ_TIMINGS_SHIFT; + count = irqs->count < IRQ_TIMINGS_SIZE ? + irqs->count : IRQ_TIMINGS_SIZE; + + start = irqs->count < IRQ_TIMINGS_SIZE ? + 0 : (irqs->count & IRQ_TIMINGS_MASK); /* - * The rule of thumb in statistics for the normal distribution - * is having at least 30 samples in order to have the model to - * apply. Values outside the interval are considered as an - * anomaly. + * Copy the content of the circular buffer into another buffer + * in order to linearize the buffer instead of dealing with + * wrapping indexes and shifted array which will be prone to + * error and extremelly difficult to debug. */ - if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { - /* - * After three consecutive anomalies, we reset the - * stats as it is no longer stable enough. - */ - if (irqs->anomalies++ >= 3) { - memset(irqs, 0, sizeof(*irqs)); - irqs->last_ts = ts; - return; - } - } else { - /* - * The anomalies must be consecutives, so at this - * point, we reset the anomalies counter. - */ - irqs->anomalies = 0; + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + + irqs->timings[i] = irqs->circ_timings[index]; + min = min_t(int, irqs->timings[i], min); } + index = irq_timings_next_event_index(irqs->timings, count, period_max); + if (index < 0) + return irqs->last_ts + irqs->ema_time[min]; + + return irqs->last_ts + irqs->ema_time[index]; +} + +static __always_inline int irq_timings_interval_index(u64 interval) +{ /* - * The interrupt is considered stable enough to try to predict - * the next event on it. + * The PREDICTION_FACTOR increase the interval size for the + * array of exponential average. */ - irqs->valid = 1; + u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; + + return likely(interval_us) ? ilog2(interval_us) : 0; +} + +static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, + u64 interval) +{ + int index; /* - * Online average algorithm: - * - * new_average = average + ((value - average) / count) - * - * The variance computation depends on the new average - * to be computed here first. - * + * Get the index in the ema table for this interrupt. 
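The search loop of irq_timings_next_event_index() above can be tried in isolation on the ilog2 sequence of Example 1. This standalone sketch mirrors the loop with stand-in names (the PERIOD_* constants are copied from the patch); it is an illustration, not the kernel code:

#include <stdio.h>
#include <string.h>

#define PERIOD_MIN	3	/* mirrors PREDICTION_PERIOD_MIN */
#define PERIOD_MAX	5	/* mirrors PREDICTION_PERIOD_MAX */

/*
 * Look for a suffix of length period_max..PERIOD_MIN that repeats across the
 * last (period_max * 3) elements; on success return the element the pattern
 * predicts next, i.e. buffer[len % period].
 */
static int next_event_index(int *buffer, int len, int period_max)
{
	int period;

	buffer = &buffer[len - (period_max * 3)];
	len = period_max * 3;

	for (period = period_max; period >= PERIOD_MIN; period--) {
		int idx = period;
		int size = period;

		while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
			idx += size;
			if (idx == len)
				return buffer[len % period];
			if (len - idx < period)
				size = len - idx;
		}
	}
	return -1;
}

int main(void)
{
	/* ilog2 indexes of the MMC example: a period-5 pattern 8,15,8,8,8 */
	int seq[] = { 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 };
	int nr = sizeof(seq) / sizeof(seq[0]);

	/* Prints 8: the predicted interval is the EMA stored in slot 8 */
	printf("predicted slot: %d\n", next_event_index(seq, nr, PERIOD_MAX));
	return 0;
}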
*/ - irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); + index = irq_timings_interval_index(interval); /* - * Online variance algorithm: - * - * new_variance = variance + (value - average) x (value - new_average) - * - * Warning: irqs->avg is updated with the line above, hence - * 'interval - irqs->avg' is no longer equal to 'diff' + * Store the index as an element of the pattern in another + * circular array. */ - irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); + irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; + + irqs->ema_time[index] = irq_timings_ema_new(interval, + irqs->ema_time[index]); + + irqs->count++; +} + +static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) +{ + u64 old_ts = irqs->last_ts; + u64 interval; /* - * Update the next event + * The timestamps are absolute time values, we need to compute + * the timing interval between two interrupts. + */ + irqs->last_ts = ts; + + /* + * The interval type is u64 in order to deal with the same + * type in our computation, that prevent mindfuck issues with + * overflow, sign and division. + */ + interval = ts - old_ts; + + /* + * The interrupt triggered more than one second apart, that + * ends the sequence as predictible for our purpose. In this + * case, assume we have the beginning of a sequence and the + * timestamp is the first value. As it is impossible to + * predict anything at this point, return. + * + * Note the first timestamp of the sequence will always fall + * in this test because the old_ts is zero. That is what we + * want as we need another timestamp to compute an interval. */ - irqs->next_evt = ts + irqs->avg; + if (interval >= NSEC_PER_SEC) { + irqs->count = 0; + return; + } + + __irq_timings_store(irq, irqs, interval); } /** @@ -259,6 +543,9 @@ u64 irq_timings_next_event(u64 now) */ lockdep_assert_irqs_disabled(); + if (!irqts->count) + return next_evt; + /* * Number of elements in the circular buffer: If it happens it * was flushed before, then the number of elements could be @@ -269,21 +556,15 @@ u64 irq_timings_next_event(u64 now) * type but with the cost of extra computation in the * interrupt handler hot path. We choose efficiency. * - * Inject measured irq/timestamp to the statistical model - * while decrementing the counter because we consume the data - * from our circular buffer. + * Inject measured irq/timestamp to the pattern prediction + * model while decrementing the counter because we consume the + * data from our circular buffer. */ - for (i = irqts->count & IRQ_TIMINGS_MASK, - irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); - irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { - + for_each_irqts(i, irqts) { irq = irq_timing_decode(irqts->values[i], &ts); - s = idr_find(&irqt_stats, irq); - if (s) { - irqs = this_cpu_ptr(s); - irqs_update(irqs, ts); - } + if (s) + irq_timings_store(irq, this_cpu_ptr(s), ts); } /* @@ -294,26 +575,12 @@ u64 irq_timings_next_event(u64 now) irqs = this_cpu_ptr(s); - if (!irqs->valid) - continue; - - if (irqs->next_evt <= now) { - irq = i; - next_evt = now; + ts = __irq_timings_next_event(irqs, i, now); + if (ts <= now) + return now; - /* - * This interrupt mustn't use in the future - * until new events occur and update the - * statistics. 
- */ - irqs->valid = 0; - break; - } - - if (irqs->next_evt < next_evt) { - irq = i; - next_evt = irqs->next_evt; - } + if (ts < next_evt) + next_evt = ts; } return next_evt; @@ -360,3 +627,325 @@ int irq_timings_alloc(int irq) return 0; } + +#ifdef CONFIG_TEST_IRQ_TIMINGS +struct timings_intervals { + u64 *intervals; + size_t count; +}; + +/* + * Intervals are given in nanosecond base + */ +static u64 intervals0[] __initdata = { + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, 500000, + 10000, 50000, 200000, +}; + +static u64 intervals1[] __initdata = { + 223947000, 1240000, 1384000, 1386000, 1386000, + 217416000, 1236000, 1384000, 1386000, 1387000, + 214719000, 1241000, 1386000, 1387000, 1384000, + 213696000, 1234000, 1384000, 1386000, 1388000, + 219904000, 1240000, 1385000, 1389000, 1385000, + 212240000, 1240000, 1386000, 1386000, 1386000, + 214415000, 1236000, 1384000, 1386000, 1387000, + 214276000, 1234000, +}; + +static u64 intervals2[] __initdata = { + 4000, 3000, 5000, 100000, + 3000, 3000, 5000, 117000, + 4000, 4000, 5000, 112000, + 4000, 3000, 4000, 110000, + 3000, 5000, 3000, 117000, + 4000, 4000, 5000, 112000, + 4000, 3000, 4000, 110000, + 3000, 4000, 5000, 112000, + 4000, +}; + +static u64 intervals3[] __initdata = { + 1385000, 212240000, 1240000, + 1386000, 214415000, 1236000, + 1384000, 214276000, 1234000, + 1386000, 214415000, 1236000, + 1385000, 212240000, 1240000, + 1386000, 214415000, 1236000, + 1384000, 214276000, 1234000, + 1386000, 214415000, 1236000, + 1385000, 212240000, 1240000, +}; + +static u64 intervals4[] __initdata = { + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, 50000, 10000, 50000, + 10000, +}; + +static struct timings_intervals tis[] __initdata = { + { intervals0, ARRAY_SIZE(intervals0) }, + { intervals1, ARRAY_SIZE(intervals1) }, + { intervals2, ARRAY_SIZE(intervals2) }, + { intervals3, ARRAY_SIZE(intervals3) }, + { intervals4, ARRAY_SIZE(intervals4) }, +}; + +static int __init irq_timings_test_next_index(struct timings_intervals *ti) +{ + int _buffer[IRQ_TIMINGS_SIZE]; + int buffer[IRQ_TIMINGS_SIZE]; + int index, start, i, count, period_max; + + count = ti->count - 1; + + period_max = count > (3 * PREDICTION_PERIOD_MAX) ? + PREDICTION_PERIOD_MAX : count / 3; + + /* + * Inject all values except the last one which will be used + * to compare with the next index result. + */ + pr_debug("index suite: "); + + for (i = 0; i < count; i++) { + index = irq_timings_interval_index(ti->intervals[i]); + _buffer[i & IRQ_TIMINGS_MASK] = index; + pr_cont("%d ", index); + } + + start = count < IRQ_TIMINGS_SIZE ? 
0 : + count & IRQ_TIMINGS_MASK; + + count = min_t(int, count, IRQ_TIMINGS_SIZE); + + for (i = 0; i < count; i++) { + int index = (start + i) & IRQ_TIMINGS_MASK; + buffer[i] = _buffer[index]; + } + + index = irq_timings_next_event_index(buffer, count, period_max); + i = irq_timings_interval_index(ti->intervals[ti->count - 1]); + + if (index != i) { + pr_err("Expected (%d) and computed (%d) next indexes differ\n", + i, index); + return -EINVAL; + } + + return 0; +} + +static int __init irq_timings_next_index_selftest(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(tis); i++) { + + pr_info("---> Injecting intervals number #%d (count=%zd)\n", + i, tis[i].count); + + ret = irq_timings_test_next_index(&tis[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_test_irqs(struct timings_intervals *ti) +{ + struct irqt_stat __percpu *s; + struct irqt_stat *irqs; + int i, index, ret, irq = 0xACE5; + + ret = irq_timings_alloc(irq); + if (ret) { + pr_err("Failed to allocate irq timings\n"); + return ret; + } + + s = idr_find(&irqt_stats, irq); + if (!s) { + ret = -EIDRM; + goto out; + } + + irqs = this_cpu_ptr(s); + + for (i = 0; i < ti->count; i++) { + + index = irq_timings_interval_index(ti->intervals[i]); + pr_debug("%d: interval=%llu ema_index=%d\n", + i, ti->intervals[i], index); + + __irq_timings_store(irq, irqs, ti->intervals[i]); + if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { + pr_err("Failed to store in the circular buffer\n"); + goto out; + } + } + + if (irqs->count != ti->count) { + pr_err("Count differs\n"); + goto out; + } + + ret = 0; +out: + irq_timings_free(irq); + + return ret; +} + +static int __init irq_timings_irqs_selftest(void) +{ + int i, ret; + + for (i = 0; i < ARRAY_SIZE(tis); i++) { + pr_info("---> Injecting intervals number #%d (count=%zd)\n", + i, tis[i].count); + ret = irq_timings_test_irqs(&tis[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_test_irqts(struct irq_timings *irqts, + unsigned count) +{ + int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0; + int i, irq, oirq = 0xBEEF; + u64 ots = 0xDEAD, ts; + + /* + * Fill the circular buffer by using the dedicated function. + */ + for (i = 0; i < count; i++) { + pr_debug("%d: index=%d, ts=%llX irq=%X\n", + i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i); + + irq_timings_push(ots + i, oirq + i); + } + + /* + * Compute the first elements values after the index wrapped + * up or not. + */ + ots += start; + oirq += start; + + /* + * Test the circular buffer count is correct. + */ + pr_debug("---> Checking timings array count (%d) is right\n", count); + if (WARN_ON(irqts->count != count)) + return -EINVAL; + + /* + * Test the macro allowing to browse all the irqts. + */ + pr_debug("---> Checking the for_each_irqts() macro\n"); + for_each_irqts(i, irqts) { + + irq = irq_timing_decode(irqts->values[i], &ts); + + pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n", + i, ts, ots, irq, oirq); + + if (WARN_ON(ts != ots || irq != oirq)) + return -EINVAL; + + ots++; oirq++; + } + + /* + * The circular buffer should have be flushed when browsed + * with for_each_irqts + */ + pr_debug("---> Checking timings array is empty after browsing it\n"); + if (WARN_ON(irqts->count)) + return -EINVAL; + + return 0; +} + +static int __init irq_timings_irqts_selftest(void) +{ + struct irq_timings *irqts = this_cpu_ptr(&irq_timings); + int i, ret; + + /* + * Test the circular buffer with different number of + * elements. 
The purpose is to test at the limits (empty, half + * full, full, wrapped with the cursor at the boundaries, + * wrapped several times, etc ... + */ + int count[] = { 0, + IRQ_TIMINGS_SIZE >> 1, + IRQ_TIMINGS_SIZE, + IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1), + 2 * IRQ_TIMINGS_SIZE, + (2 * IRQ_TIMINGS_SIZE) + 3, + }; + + for (i = 0; i < ARRAY_SIZE(count); i++) { + + pr_info("---> Checking the timings with %d/%d values\n", + count[i], IRQ_TIMINGS_SIZE); + + ret = irq_timings_test_irqts(irqts, count[i]); + if (ret) + break; + } + + return ret; +} + +static int __init irq_timings_selftest(void) +{ + int ret; + + pr_info("------------------- selftest start -----------------\n"); + + /* + * At this point, we don't except any subsystem to use the irq + * timings but us, so it should not be enabled. + */ + if (static_branch_unlikely(&irq_timing_enabled)) { + pr_warn("irq timings already initialized, skipping selftest\n"); + return 0; + } + + ret = irq_timings_irqts_selftest(); + if (ret) + goto out; + + ret = irq_timings_irqs_selftest(); + if (ret) + goto out; + + ret = irq_timings_next_index_selftest(); +out: + pr_info("---------- selftest end with %s -----------\n", + ret ? "failure" : "success"); + + return ret; +} +early_initcall(irq_timings_selftest); +#endif diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..d42acaf81886 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * @@ -56,61 +57,70 @@ void __weak arch_irq_work_raise(void) */ } -/* - * Enqueue the irq_work @work on @cpu unless it's already pending - * somewhere. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue_on(struct irq_work *work, int cpu) +/* Enqueue on current CPU, work must already be claimed and preempt disabled */ +static void __irq_work_queue_local(struct irq_work *work) { - /* All work should have been flushed before going offline */ - WARN_ON_ONCE(cpu_is_offline(cpu)); - -#ifdef CONFIG_SMP - - /* Arch remote IPI send/receive backend aren't NMI safe */ - WARN_ON_ONCE(in_nmi()); + /* If the work is "lazy", handle it from next tick if any */ + if (work->flags & IRQ_WORK_LAZY) { + if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + tick_nohz_tick_stopped()) + arch_irq_work_raise(); + } else { + if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + arch_irq_work_raise(); + } +} +/* Enqueue the irq work @work on the current CPU */ +bool irq_work_queue(struct irq_work *work) +{ /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) - arch_send_call_function_single_ipi(cpu); - -#else /* #ifdef CONFIG_SMP */ - irq_work_queue(work); -#endif /* #else #ifdef CONFIG_SMP */ + /* Queue the entry and raise the IPI if needed. */ + preempt_disable(); + __irq_work_queue_local(work); + preempt_enable(); return true; } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* Enqueue the irq work @work on the current CPU */ -bool irq_work_queue(struct irq_work *work) +/* + * Enqueue the irq_work @work on @cpu unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. 
+ */ +bool irq_work_queue_on(struct irq_work *work, int cpu) { +#ifndef CONFIG_SMP + return irq_work_queue(work); + +#else /* CONFIG_SMP: */ + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(cpu)); + /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; - /* Queue the entry and raise the IPI if needed. */ preempt_disable(); - - /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); + if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) + arch_send_call_function_single_ipi(cpu); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); + __irq_work_queue_local(work); } - preempt_enable(); return true; +#endif /* CONFIG_SMP */ } -EXPORT_SYMBOL_GPL(irq_work_queue); + bool irq_work_needs_cpu(void) { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..df3008419a1d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * jump label support * @@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b) const struct jump_entry *jea = a; const struct jump_entry *jeb = b; + /* + * Entrires are sorted by key. + */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; + /* + * In the batching mode, entries should also be sorted by the code + * inside the already sorted list of entries, enabling a bsearch in + * the vector. + */ + if (jump_entry_code(jea) < jump_entry_code(jeb)) + return -1; + + if (jump_entry_code(jea) > jump_entry_code(jeb)) + return 1; + return 0; } @@ -202,11 +217,13 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec_cpuslocked(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static bool static_key_slow_try_dec(struct static_key *key) { - lockdep_assert_cpus_held(); + int val; + + val = atomic_fetch_add_unless(&key->enabled, -1, 1); + if (val == 1) + return false; /* * The negative count check is valid even when a negative @@ -215,63 +232,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key, * returns is unbalanced, because all other static_key_slow_inc() * instances block while the update is in progress. 
*/ - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { - WARN(atomic_read(&key->enabled) < 0, - "jump label: negative count!\n"); + WARN(val < 0, "jump label: negative count!\n"); + return true; +} + +static void __static_key_slow_dec_cpuslocked(struct static_key *key) +{ + lockdep_assert_cpus_held(); + + if (static_key_slow_try_dec(key)) return; - } - if (rate_limit) { - atomic_inc(&key->enabled); - schedule_delayed_work(work, rate_limit); - } else { + jump_label_lock(); + if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); - } jump_label_unlock(); } -static void __static_key_slow_dec(struct static_key *key, - unsigned long rate_limit, - struct delayed_work *work) +static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); - __static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } -static void jump_label_update_timeout(struct work_struct *work) +void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); - __static_key_slow_dec(&key->key, 0, NULL); + __static_key_slow_dec(&key->key); } +EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(key, 0, NULL); + __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec_cpuslocked(key, 0, NULL); + __static_key_slow_dec_cpuslocked(key); } -void static_key_slow_dec_deferred(struct static_key_deferred *key) +void __static_key_slow_dec_deferred(struct static_key *key, + struct delayed_work *work, + unsigned long timeout) { STATIC_KEY_CHECK_USE(key); - __static_key_slow_dec(&key->key, key->timeout, &key->work); + + if (static_key_slow_try_dec(key)) + return; + + schedule_delayed_work(work, timeout); } -EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -void static_key_deferred_flush(struct static_key_deferred *key) +void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); - flush_delayed_work(&key->work); + flush_delayed_work(work); } -EXPORT_SYMBOL_GPL(static_key_deferred_flush); +EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) @@ -374,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry) return enabled ^ branch; } +static bool jump_label_can_update(struct jump_entry *entry, bool init) +{ + /* + * Cannot update code that was in an init text area. + */ + if (!init && jump_entry_is_init(entry)) + return false; + + if (!kernel_text_address(jump_entry_code(entry))) { + WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); + return false; + } + + return true; +} + +#ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { - /* - * An entry->code of 0 indicates an entry which has been - * disabled because it was in an init text area. 
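The new static_key_slow_try_dec() relies on atomic_fetch_add_unless() so the lockless fast path drops a reference only when it is not the last one; the final decrement still goes through the mutex-protected slow path. A rough equivalent of that idea in portable C11 atomics (sketch only, not the kernel primitive):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * fetch_add_unless(v, a, u): add @a to @v unless it currently equals @u;
 * return the old value either way (same semantics as the kernel helper).
 */
static int fetch_add_unless(atomic_int *v, int a, int u)
{
	int c = atomic_load(v);

	while (c != u && !atomic_compare_exchange_weak(v, &c, c + a))
		;
	return c;
}

/*
 * "Fast" reference drop: succeeds only while more than one user remains,
 * so releasing the final reference can be routed to a slower, locked path.
 */
static bool try_dec(atomic_int *enabled)
{
	return fetch_add_unless(enabled, -1, 1) != 1;
}

int main(void)
{
	atomic_int enabled = 3;

	printf("%d\n", try_dec(&enabled));	/* 1: 3 -> 2 */
	printf("%d\n", try_dec(&enabled));	/* 1: 2 -> 1 */
	printf("%d\n", try_dec(&enabled));	/* 0: last user, take slow path */
	return 0;
}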
- */ - if (init || !jump_entry_is_init(entry)) { - if (kernel_text_address(jump_entry_code(entry))) - arch_jump_label_transform(entry, jump_label_type(entry)); - else - WARN_ONCE(1, "can't patch jump_label at %pS", - (void *)jump_entry_code(entry)); + if (jump_label_can_update(entry, init)) + arch_jump_label_transform(entry, jump_label_type(entry)); + } +} +#else +static void __jump_label_update(struct static_key *key, + struct jump_entry *entry, + struct jump_entry *stop, + bool init) +{ + for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { + + if (!jump_label_can_update(entry, init)) + continue; + + if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { + /* + * Queue is full: Apply the current queue and try again. + */ + arch_jump_label_transform_apply(); + BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } + arch_jump_label_transform_apply(); } +#endif void __init jump_label_init(void) { diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index f3a04994e063..95a260f9214b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. * @@ -494,7 +495,7 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter) static int get_ksymbol_bpf(struct kallsym_iter *iter) { - iter->module_name[0] = '\0'; + strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN); iter->exported = 0; return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end, &iter->value, &iter->type, diff --git a/kernel/kcov.c b/kernel/kcov.c index c2277dbdbfb1..2ee38727844a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -20,6 +20,7 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/kcov.h> +#include <linux/refcount.h> #include <asm/setup.h> /* Number of 64-bit words written per one comparison: */ @@ -44,7 +45,7 @@ struct kcov { * - opened file descriptor * - task with enabled coverage (we can't unwire it from another task) */ - atomic_t refcount; + refcount_t refcount; /* The lock protects mode, size, area and t. */ spinlock_t lock; enum kcov_mode mode; @@ -228,12 +229,12 @@ EXPORT_SYMBOL(__sanitizer_cov_trace_switch); static void kcov_get(struct kcov *kcov) { - atomic_inc(&kcov->refcount); + refcount_inc(&kcov->refcount); } static void kcov_put(struct kcov *kcov) { - if (atomic_dec_and_test(&kcov->refcount)) { + if (refcount_dec_and_test(&kcov->refcount)) { vfree(kcov->area); kfree(kcov); } @@ -312,7 +313,7 @@ static int kcov_open(struct inode *inode, struct file *filep) if (!kcov) return -ENOMEM; kcov->mode = KCOV_MODE_DISABLED; - atomic_set(&kcov->refcount, 1); + refcount_set(&kcov->refcount, 1); spin_lock_init(&kcov->lock); filep->private_data = kcov; return nonseekable_open(inode, filep); @@ -444,10 +445,8 @@ static int __init kcov_init(void) * there is no need to protect it against removal races. The * use of debugfs_create_file_unsafe() is actually safe here. 
*/ - if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) { - pr_err("failed to create kcov in debugfs\n"); - return -ENOMEM; - } + debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); + return 0; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 68559808fdfa..1b018f1a6e0d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec_load system call * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..d5870723b8ad 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -1150,7 +1148,7 @@ int kernel_kexec(void) error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); @@ -1183,7 +1181,7 @@ int kernel_kexec(void) Enable_irqs: local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..b8cc032d5620 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kexec: kexec_file_load system call * * Copyright (C) 2014 Red Hat Inc. * Authors: * Vivek Goyal <vgoyal@redhat.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -198,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, return ret; image->kernel_buf_len = size; - /* IMA needs to pass the measurement list to the next kernel. */ - ima_add_kexec_buffer(image); - /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); @@ -241,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = -EINVAL; goto out; } + + ima_kexec_cmdline(image->cmdline_buf, + image->cmdline_buf_len - 1); } + /* IMA needs to pass the measurement list to the next kernel. 
*/ + ima_add_kexec_buffer(image); + /* Call arch image load handlers */ ldata = arch_kexec_kernel_image_load(image); @@ -500,13 +501,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +545,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +590,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); @@ -688,7 +689,6 @@ static int kexec_calculate_store_digests(struct kimage *image) goto out_free_desc; desc->tfm = tfm; - desc->flags = 0; ret = crypto_shash_init(desc); if (ret < 0) diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..8f69772af77b --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/init.h> + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. 
+ */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + memcpy(buf, &kernel_headers_data + off, len); + return len; +} + +static struct bin_attribute kheaders_attr __ro_after_init = { + .attr = { + .name = "kheaders.tar.xz", + .mode = 0444, + }, + .read = &ikheaders_read, +}; + +static int __init ikheaders_init(void) +{ + kheaders_attr.size = (&kernel_headers_data_end - + &kernel_headers_data); + return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); +} + +static void __exit ikheaders_cleanup(void) +{ + sysfs_remove_bin_file(kernel_kobj, &kheaders_attr); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f4ddfdd2d07e..9f5433a52488 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) * kernel/kprobes.c * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
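The archive embedded by kheaders.c above is exposed as /sys/kernel/kheaders.tar.xz through the bin_attribute registered on kernel_kobj. A minimal illustrative reader (error handling kept to the bare minimum) that copies the blob to a regular file so it can be unpacked with tar:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Path as created by ikheaders_init() above */
	FILE *in = fopen("/sys/kernel/kheaders.tar.xz", "rb");
	FILE *out = fopen("kheaders.tar.xz", "wb");
	char buf[4096];
	size_t n;

	if (!in || !out) {
		perror("open");
		return EXIT_FAILURE;
	}

	/* Copy the whole sysfs blob; extract afterwards with "tar xf" */
	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);

	fclose(in);
	fclose(out);
	return 0;
}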
- * * Copyright (C) IBM Corporation, 2002, 2004 * * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel @@ -709,7 +696,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; - int ret; /* * Unused kprobe MUST be on the way of delayed unoptimizing (means @@ -720,9 +706,8 @@ static int reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - ret = kprobe_optready(ap); - if (ret) - return ret; + if (!kprobe_optready(ap)) + return -EINVAL; optimize_kprobe(ap); return 0; @@ -1396,7 +1381,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -bool within_kprobe_blacklist(unsigned long addr) +static bool __within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; @@ -1410,7 +1395,26 @@ bool within_kprobe_blacklist(unsigned long addr) if (addr >= ent->start_addr && addr < ent->end_addr) return true; } + return false; +} +bool within_kprobe_blacklist(unsigned long addr) +{ + char symname[KSYM_NAME_LEN], *p; + + if (__within_kprobe_blacklist(addr)) + return true; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return false; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_kprobe_blacklist(addr); + } return false; } @@ -2566,33 +2570,20 @@ static const struct file_operations fops_kp = { static int __init debugfs_kprobe_init(void) { - struct dentry *dir, *file; + struct dentry *dir; unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); - if (!dir) - return -ENOMEM; - file = debugfs_create_file("list", 0400, dir, NULL, - &debugfs_kprobes_operations); - if (!file) - goto error; + debugfs_create_file("list", 0400, dir, NULL, + &debugfs_kprobes_operations); - file = debugfs_create_file("enabled", 0600, dir, - &value, &fops_kp); - if (!file) - goto error; + debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); - file = debugfs_create_file("blacklist", 0400, dir, NULL, - &debugfs_kprobe_blacklist_ops); - if (!file) - goto error; + debugfs_create_file("blacklist", 0400, dir, NULL, + &debugfs_kprobe_blacklist_ops); return 0; - -error: - debugfs_remove(dir); - return -ENOMEM; } late_initcall(debugfs_kprobe_init); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 46ba853656f6..35859da8bd4f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which * are not related to any other subsystem * * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> - * - * This file is release under the GPLv2 - * */ #include <linux/kobject.h> diff --git a/kernel/kthread.c b/kernel/kthread.c index 087d18d771b5..621467c33fef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. 
* @@ -11,6 +12,7 @@ #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> +#include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> @@ -20,6 +22,7 @@ #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> +#include <linux/numa.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -101,6 +104,12 @@ bool kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +bool __kthread_should_park(struct task_struct *k) +{ + return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); +} +EXPORT_SYMBOL_GPL(__kthread_should_park); + /** * kthread_should_park - should this kthread park now? * @@ -114,7 +123,7 @@ EXPORT_SYMBOL(kthread_should_stop); */ bool kthread_should_park(void) { - return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); + return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); @@ -599,7 +608,7 @@ void __kthread_init_worker(struct kthread_worker *worker, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); @@ -641,21 +650,21 @@ repeat: if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); worker->task = NULL; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (work) { __set_current_state(TASK_RUNNING); @@ -675,7 +684,7 @@ __kthread_create_worker(int cpu, unsigned int flags, { struct kthread_worker *worker; struct task_struct *task; - int node = -1; + int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -812,12 +821,12 @@ bool kthread_queue_work(struct kthread_worker *worker, bool ret = false; unsigned long flags; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); @@ -835,6 +844,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; + unsigned long flags; /* * This might happen when a pending work is reinitialized. @@ -843,7 +853,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) if (WARN_ON_ONCE(!worker)) return; - spin_lock(&worker->lock); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ WARN_ON_ONCE(work->worker != worker); @@ -852,7 +862,7 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) list_del_init(&work->node); kthread_insert_work(worker, work, &worker->work_list); - spin_unlock(&worker->lock); + raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); @@ -908,14 +918,14 @@ bool kthread_queue_delayed_work(struct kthread_worker *worker, unsigned long flags; bool ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); @@ -951,7 +961,7 @@ void kthread_flush_work(struct kthread_work *work) if (!worker) return; - spin_lock_irq(&worker->lock); + raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -963,7 +973,7 @@ void kthread_flush_work(struct kthread_work *work) else noop = true; - spin_unlock_irq(&worker->lock); + raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); @@ -996,9 +1006,9 @@ static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, * any queuing is blocked by setting the canceling counter. */ work->canceling++; - spin_unlock_irqrestore(&worker->lock, *flags); + raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); - spin_lock_irqsave(&worker->lock, *flags); + raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } @@ -1045,7 +1055,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, unsigned long flags; int ret = false; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) @@ -1062,7 +1072,7 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); @@ -1076,7 +1086,7 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) if (!worker) goto out; - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); @@ -1090,13 +1100,13 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) * In the meantime, block any queuing by setting the canceling counter. 
*/ work->canceling++; - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); - spin_lock_irqsave(&worker->lock, flags); + raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: - spin_unlock_irqrestore(&worker->lock, flags); + raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..e3acead004e6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * latencytop.c: Latency display infrastructure * * (C) Copyright 2008 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ /* @@ -67,13 +63,10 @@ static struct latency_record latency_record[MAXLR]; int latencytop_enabled; -void clear_all_latency_tracing(struct task_struct *p) +void clear_tsk_latency_tracing(struct task_struct *p) { unsigned long flags; - if (!latencytop_enabled) - return; - raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; @@ -96,9 +89,6 @@ account_global_scheduler_latency(struct task_struct *tsk, int firstnonnull = MAXLR + 1; int i; - if (!latencytop_enabled) - return; - /* skip kernel threads for now */ if (!tsk->mm) return; @@ -120,8 +110,8 @@ account_global_scheduler_latency(struct task_struct *tsk, break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry marks end of backtrace: */ + if (!record) break; } if (same) { @@ -141,20 +131,6 @@ account_global_scheduler_latency(struct task_struct *tsk, memcpy(&latency_record[i], lat, sizeof(struct latency_record)); } -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, - struct latency_record *lat) -{ - struct stack_trace trace; - - memset(&trace, 0, sizeof(trace)); - trace.max_entries = LT_BACKTRACEDEPTH; - trace.entries = &lat->backtrace[0]; - save_stack_trace_tsk(tsk, &trace); -} - /** * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency @@ -191,7 +167,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.count = 1; lat.time = usecs; lat.max = usecs; - store_stacktrace(tsk, &lat); + + stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0); raw_spin_lock_irqsave(&latency_lock, flags); @@ -210,8 +187,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) break; } - /* 0 and ULONG_MAX entries mean end of backtrace: */ - if (record == 0 || record == ULONG_MAX) + /* 0 entry is end of backtrace */ + if (!record) break; } if (same) { @@ -252,10 +229,10 @@ static int lstats_show(struct seq_file *m, void *v) lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (bt == ULONG_MAX) - break; + seq_printf(m, " %ps", (void *)bt); } seq_puts(m, "\n"); diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index ec4565122e65..54102deb50ba 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config HAVE_LIVEPATCH bool help diff 
--git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index b36ceda6488e..cf9b5bcdb952 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o patch.o shadow.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 5b77a7314e01..c4ce08f43bd6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * core.c - Kernel Live Patching Core * * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> * Copyright (C) 2014 SUSE - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -30,6 +18,7 @@ #include <linux/elf.h> #include <linux/moduleloader.h> #include <linux/completion.h> +#include <linux/memory.h> #include <asm/cacheflush.h> #include "core.h" #include "patch.h" @@ -45,7 +34,12 @@ */ DEFINE_MUTEX(klp_mutex); -static LIST_HEAD(klp_patches); +/* + * Actively used patches: enabled or in transition. Note that replaced + * or disabled patches are not listed even though the related kernel + * module still can be loaded. 
+ */ +LIST_HEAD(klp_patches); static struct kobject *klp_root_kobj; @@ -82,20 +76,43 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -static bool klp_is_patch_registered(struct klp_patch *patch) +static bool klp_initialized(void) { - struct klp_patch *mypatch; + return !!klp_root_kobj; +} - list_for_each_entry(mypatch, &klp_patches, list) - if (mypatch == patch) - return true; +static struct klp_func *klp_find_func(struct klp_object *obj, + struct klp_func *old_func) +{ + struct klp_func *func; + + klp_for_each_func(obj, func) { + if ((strcmp(old_func->old_name, func->old_name) == 0) && + (old_func->old_sympos == func->old_sympos)) { + return func; + } + } - return false; + return NULL; } -static bool klp_initialized(void) +static struct klp_object *klp_find_object(struct klp_patch *patch, + struct klp_object *old_obj) { - return !!klp_root_kobj; + struct klp_object *obj; + + klp_for_each_object(patch, obj) { + if (klp_is_module(old_obj)) { + if (klp_is_module(obj) && + strcmp(old_obj->name, obj->name) == 0) { + return obj; + } + } else if (!klp_is_module(obj)) { + return obj; + } + } + + return NULL; } struct klp_find_arg { @@ -278,170 +295,6 @@ static int klp_write_object_relocations(struct module *pmod, return ret; } -static int __klp_disable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - - if (WARN_ON(!patch->enabled)) - return -EINVAL; - - if (klp_transition_patch) - return -EBUSY; - - /* enforce stacking: only the last enabled patch can be disabled */ - if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->enabled) - return -EBUSY; - - klp_init_transition(patch, KLP_UNPATCHED); - - klp_for_each_object(patch, obj) - if (obj->patched) - klp_pre_unpatch_callback(obj); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the TIF_PATCH_PENDING writes in - * klp_start_transition(). In the rare case where klp_ftrace_handler() - * is called shortly after klp_update_patch_state() switches the task, - * this ensures the handler sees that func->transition is set. - */ - smp_wmb(); - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = false; - - return 0; -} - -/** - * klp_disable_patch() - disables a registered patch - * @patch: The registered, enabled patch to be disabled - * - * Unregisters the patched functions from ftrace. - * - * Return: 0 on success, otherwise error - */ -int klp_disable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - if (!patch->enabled) { - ret = -EINVAL; - goto err; - } - - ret = __klp_disable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_disable_patch); - -static int __klp_enable_patch(struct klp_patch *patch) -{ - struct klp_object *obj; - int ret; - - if (klp_transition_patch) - return -EBUSY; - - if (WARN_ON(patch->enabled)) - return -EINVAL; - - /* enforce stacking: only the first disabled patch can be enabled */ - if (patch->list.prev != &klp_patches && - !list_prev_entry(patch, list)->enabled) - return -EBUSY; - - /* - * A reference is taken on the patch module to prevent it from being - * unloaded. 
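The klp_find_func() helper above identifies a patched function by the pair (old_name, old_sympos), which is the same pair a livepatch author fills in when the target symbol name is not unique in the object. A minimal, hypothetical klp_func entry illustrating that knob (lp_show and the duplicated "show" symbol are placeholders, not taken from this patch):

#include <linux/livepatch.h>
#include <linux/seq_file.h>

/* Hypothetical replacement body. */
static int lp_show(struct seq_file *m, void *v)
{
	return 0;
}

static struct klp_func sketch_funcs[] = {
	{
		.old_name   = "show",
		.new_func   = lp_show,
		/*
		 * Several static functions called "show" exist in the
		 * target object; old_sympos selects the occurrence
		 * (1-based, in kallsyms order; 0 requires the symbol
		 * to be unique).
		 */
		.old_sympos = 2,
	}, { }
};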
- */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - pr_notice("enabling patch '%s'\n", patch->mod->name); - - klp_init_transition(patch, KLP_PATCHED); - - /* - * Enforce the order of the func->transition writes in - * klp_init_transition() and the ops->func_stack writes in - * klp_patch_object(), so that klp_ftrace_handler() will see the - * func->transition updates before the handler is registered and the - * new funcs become visible to the handler. - */ - smp_wmb(); - - klp_for_each_object(patch, obj) { - if (!klp_is_object_loaded(obj)) - continue; - - ret = klp_pre_patch_callback(obj); - if (ret) { - pr_warn("pre-patch callback failed for object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - - ret = klp_patch_object(obj); - if (ret) { - pr_warn("failed to patch object '%s'\n", - klp_is_module(obj) ? obj->name : "vmlinux"); - goto err; - } - } - - klp_start_transition(); - klp_try_complete_transition(); - patch->enabled = true; - - return 0; -err: - pr_warn("failed to enable patch '%s'\n", patch->mod->name); - - klp_cancel_transition(); - return ret; -} - -/** - * klp_enable_patch() - enables a registered patch - * @patch: The registered, disabled patch to be enabled - * - * Performs the needed symbol lookups and code relocations, - * then registers the patched functions with ftrace. - * - * Return: 0 on success, otherwise error - */ -int klp_enable_patch(struct klp_patch *patch) -{ - int ret; - - mutex_lock(&klp_mutex); - - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } - - ret = __klp_enable_patch(patch); - -err: - mutex_unlock(&klp_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(klp_enable_patch); - /* * Sysfs Interface * @@ -449,11 +302,11 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/transition - * /sys/kernel/livepatch/<patch>/signal * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ +static int __klp_disable_patch(struct klp_patch *patch); static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -470,40 +323,32 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&klp_mutex); - if (!klp_is_patch_registered(patch)) { - /* - * Module with the patch could either disappear meanwhile or is - * not properly initialized yet. - */ - ret = -EINVAL; - goto err; - } - if (patch->enabled == enabled) { /* already in requested state */ ret = -EINVAL; - goto err; + goto out; } - if (patch == klp_transition_patch) { + /* + * Allow to reverse a pending transition in both ways. It might be + * necessary to complete the transition without forcing and breaking + * the system integrity. + * + * Do not allow to re-enable a disabled patch. 
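With the separate register/enable API gone, the enabled attribute is the only runtime switch that remains: writing 0 to an enabled patch starts unpatching, writing the value the patch already has returns -EINVAL, and writing 1 is accepted only to reverse a still-pending unpatching transition. A small userspace sketch of driving it, assuming a loaded patch module named livepatch_sample (the name is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/livepatch/livepatch_sample/enabled";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "0" starts the unpatching transition for an enabled patch. */
	if (write(fd, "0", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}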
+ */ + if (patch == klp_transition_patch) klp_reverse_transition(); - } else if (enabled) { - ret = __klp_enable_patch(patch); - if (ret) - goto err; - } else { + else if (!enabled) ret = __klp_disable_patch(patch); - if (ret) - goto err; - } + else + ret = -EINVAL; +out: mutex_unlock(&klp_mutex); + if (ret) + return ret; return count; - -err: - mutex_unlock(&klp_mutex); - return ret; } static ssize_t enabled_show(struct kobject *kobj, @@ -525,35 +370,6 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } -static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct klp_patch *patch; - int ret; - bool val; - - ret = kstrtobool(buf, &val); - if (ret) - return ret; - - if (!val) - return count; - - mutex_lock(&klp_mutex); - - patch = container_of(kobj, struct klp_patch, kobj); - if (patch != klp_transition_patch) { - mutex_unlock(&klp_mutex); - return -EINVAL; - } - - klp_send_signals(); - - mutex_unlock(&klp_mutex); - - return count; -} - static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -585,15 +401,132 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); -static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, - &signal_kobj_attr.attr, &force_kobj_attr.attr, NULL }; +ATTRIBUTE_GROUPS(klp_patch); + +static void klp_free_object_dynamic(struct klp_object *obj) +{ + kfree(obj->name); + kfree(obj); +} + +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj); + +static struct klp_object *klp_alloc_object_dynamic(const char *name, + struct klp_patch *patch) +{ + struct klp_object *obj; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return NULL; + + if (name) { + obj->name = kstrdup(name, GFP_KERNEL); + if (!obj->name) { + kfree(obj); + return NULL; + } + } + + klp_init_object_early(patch, obj); + obj->dynamic = true; + + return obj; +} + +static void klp_free_func_nop(struct klp_func *func) +{ + kfree(func->old_name); + kfree(func); +} + +static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func, + struct klp_object *obj) +{ + struct klp_func *func; + + func = kzalloc(sizeof(*func), GFP_KERNEL); + if (!func) + return NULL; + + if (old_func->old_name) { + func->old_name = kstrdup(old_func->old_name, GFP_KERNEL); + if (!func->old_name) { + kfree(func); + return NULL; + } + } + + klp_init_func_early(obj, func); + /* + * func->new_func is same as func->old_func. These addresses are + * set when the object is loaded, see klp_init_object_loaded(). 
+ */ + func->old_sympos = old_func->old_sympos; + func->nop = true; + + return func; +} + +static int klp_add_object_nops(struct klp_patch *patch, + struct klp_object *old_obj) +{ + struct klp_object *obj; + struct klp_func *func, *old_func; + + obj = klp_find_object(patch, old_obj); + + if (!obj) { + obj = klp_alloc_object_dynamic(old_obj->name, patch); + if (!obj) + return -ENOMEM; + } + + klp_for_each_func(old_obj, old_func) { + func = klp_find_func(obj, old_func); + if (func) + continue; + + func = klp_alloc_func_nop(old_func, obj); + if (!func) + return -ENOMEM; + } + + return 0; +} + +/* + * Add 'nop' functions which simply return to the caller to run + * the original function. The 'nop' functions are added to a + * patch to facilitate a 'replace' mode. + */ +static int klp_add_nops(struct klp_patch *patch) +{ + struct klp_patch *old_patch; + struct klp_object *old_obj; + + klp_for_each_patch(old_patch) { + klp_for_each_object(old_patch, old_obj) { + int err; + + err = klp_add_object_nops(patch, old_obj); + if (err) + return err; + } + } + + return 0; +} static void klp_kobj_release_patch(struct kobject *kobj) { @@ -606,11 +539,17 @@ static void klp_kobj_release_patch(struct kobject *kobj) static struct kobj_type klp_ktype_patch = { .release = klp_kobj_release_patch, .sysfs_ops = &kobj_sysfs_ops, - .default_attrs = klp_patch_attrs, + .default_groups = klp_patch_groups, }; static void klp_kobj_release_object(struct kobject *kobj) { + struct klp_object *obj; + + obj = container_of(kobj, struct klp_object, kobj); + + if (obj->dynamic) + klp_free_object_dynamic(obj); } static struct kobj_type klp_ktype_object = { @@ -620,6 +559,12 @@ static struct kobj_type klp_ktype_object = { static void klp_kobj_release_func(struct kobject *kobj) { + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + + if (func->nop) + klp_free_func_nop(func); } static struct kobj_type klp_ktype_func = { @@ -627,17 +572,17 @@ static struct kobj_type klp_ktype_func = { .sysfs_ops = &kobj_sysfs_ops, }; -/* - * Free all functions' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. - */ -static void klp_free_funcs_limited(struct klp_object *obj, - struct klp_func *limit) +static void __klp_free_funcs(struct klp_object *obj, bool nops_only) { - struct klp_func *func; + struct klp_func *func, *tmp_func; + + klp_for_each_func_safe(obj, func, tmp_func) { + if (nops_only && !func->nop) + continue; - for (func = obj->funcs; func->old_name && func != limit; func++) + list_del(&func->node); kobject_put(&func->kobj); + } } /* Clean up when a patched object is unloaded */ @@ -647,35 +592,101 @@ static void klp_free_object_loaded(struct klp_object *obj) obj->mod = NULL; - klp_for_each_func(obj, func) - func->old_addr = 0; + klp_for_each_func(obj, func) { + func->old_func = NULL; + + if (func->nop) + func->new_func = NULL; + } } -/* - * Free all objects' kobjects in the array up to some limit. When limit is - * NULL, all kobjects are freed. 
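From a patch author's point of view, all of the nop machinery above is triggered by a single flag: setting .replace makes the new patch cumulative, and nop entries are generated internally for every function that only older patches touch. A sketch of such a patch, modelled loosely on samples/livepatch (cmdline_proc_show is used purely as an illustration):

#include <linux/livepatch.h>
#include <linux/module.h>
#include <linux/seq_file.h>

/* Replacement for the in-kernel cmdline_proc_show(). */
static int lp_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_puts(m, "patched cmdline\n");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = lp_cmdline_proc_show,
	}, { }
};

static struct klp_object objs[] = {
	{
		/* A NULL name means the function lives in vmlinux. */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
	.replace = true,	/* atomically take over from older livepatches */
};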
- */ -static void klp_free_objects_limited(struct klp_patch *patch, - struct klp_object *limit) +static void __klp_free_objects(struct klp_patch *patch, bool nops_only) { - struct klp_object *obj; + struct klp_object *obj, *tmp_obj; - for (obj = patch->objs; obj->funcs && obj != limit; obj++) { - klp_free_funcs_limited(obj, NULL); + klp_for_each_object_safe(patch, obj, tmp_obj) { + __klp_free_funcs(obj, nops_only); + + if (nops_only && !obj->dynamic) + continue; + + list_del(&obj->node); kobject_put(&obj->kobj); } } -static void klp_free_patch(struct klp_patch *patch) +static void klp_free_objects(struct klp_patch *patch) +{ + __klp_free_objects(patch, false); +} + +static void klp_free_objects_dynamic(struct klp_patch *patch) +{ + __klp_free_objects(patch, true); +} + +/* + * This function implements the free operations that can be called safely + * under klp_mutex. + * + * The operation must be completed by calling klp_free_patch_finish() + * outside klp_mutex. + */ +void klp_free_patch_start(struct klp_patch *patch) { - klp_free_objects_limited(patch, NULL); if (!list_empty(&patch->list)) list_del(&patch->list); + + klp_free_objects(patch); +} + +/* + * This function implements the free part that must be called outside + * klp_mutex. + * + * It must be called after klp_free_patch_start(). And it has to be + * the last function accessing the livepatch structures when the patch + * gets disabled. + */ +static void klp_free_patch_finish(struct klp_patch *patch) +{ + /* + * Avoid deadlock with enabled_store() sysfs callback by + * calling this outside klp_mutex. It is safe because + * this is called when the patch gets disabled and it + * cannot get enabled again. + */ + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + + /* Put the module after the last access to struct klp_patch. */ + if (!patch->forced) + module_put(patch->mod); +} + +/* + * The livepatch might be freed from sysfs interface created by the patch. + * This work allows to wait until the interface is destroyed in a separate + * context. + */ +static void klp_free_patch_work_fn(struct work_struct *work) +{ + struct klp_patch *patch = + container_of(work, struct klp_patch, free_work); + + klp_free_patch_finish(patch); } static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - if (!func->old_name || !func->new_func) + if (!func->old_name) + return -EINVAL; + + /* + * NOPs get the address later. The patched module must be loaded, + * see klp_init_object_loaded(). + */ + if (!func->new_func && !func->nop) return -EINVAL; if (strlen(func->old_name) >= KSYM_NAME_LEN) @@ -690,9 +701,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) * object. If the user selects 0 for old_sympos, then 1 will be used * since a unique symbol will be the first occurrence. */ - return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s,%lu", func->old_name, - func->old_sympos ? func->old_sympos : 1); + return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", + func->old_name, + func->old_sympos ? 
func->old_sympos : 1); } /* Arches may override this to finish any remaining arch-specific tasks */ @@ -708,24 +719,29 @@ static int klp_init_object_loaded(struct klp_patch *patch, struct klp_func *func; int ret; + mutex_lock(&text_mutex); + module_disable_ro(patch->mod); ret = klp_write_object_relocations(patch->mod, obj); if (ret) { module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); return ret; } arch_klp_init_object_loaded(patch, obj); module_enable_ro(patch->mod, true); + mutex_unlock(&text_mutex); + klp_for_each_func(obj, func) { ret = klp_find_object_symbol(obj->name, func->old_name, func->old_sympos, - &func->old_addr); + (unsigned long *)&func->old_func); if (ret) return ret; - ret = kallsyms_lookup_size_offset(func->old_addr, + ret = kallsyms_lookup_size_offset((unsigned long)func->old_func, &func->old_size, NULL); if (!ret) { pr_err("kallsyms size lookup failed for '%s'\n", @@ -733,6 +749,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, return -ENOENT; } + if (func->nop) + func->new_func = func->old_func; + ret = kallsyms_lookup_size_offset((unsigned long)func->new_func, &func->new_size, NULL); if (!ret) { @@ -751,9 +770,6 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) int ret; const char *name; - if (!obj->funcs) - return -EINVAL; - if (klp_is_module(obj) && strlen(obj->name) >= MODULE_NAME_LEN) return -EINVAL; @@ -763,126 +779,200 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) klp_find_object_module(obj); name = klp_is_module(obj) ? obj->name : "vmlinux"; - ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, - &patch->kobj, "%s", name); + ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name); if (ret) return ret; klp_for_each_func(obj, func) { ret = klp_init_func(obj, func); if (ret) - goto free; + return ret; } - if (klp_is_object_loaded(obj)) { + if (klp_is_object_loaded(obj)) ret = klp_init_object_loaded(patch, obj); - if (ret) - goto free; - } - - return 0; -free: - klp_free_funcs_limited(obj, func); - kobject_put(&obj->kobj); return ret; } -static int klp_init_patch(struct klp_patch *patch) +static void klp_init_func_early(struct klp_object *obj, + struct klp_func *func) +{ + kobject_init(&func->kobj, &klp_ktype_func); + list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, + struct klp_object *obj) +{ + INIT_LIST_HEAD(&obj->func_list); + kobject_init(&obj->kobj, &klp_ktype_object); + list_add_tail(&obj->node, &patch->obj_list); +} + +static int klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; - int ret; + struct klp_func *func; if (!patch->objs) return -EINVAL; - mutex_lock(&klp_mutex); - + INIT_LIST_HEAD(&patch->list); + INIT_LIST_HEAD(&patch->obj_list); + kobject_init(&patch->kobj, &klp_ktype_patch); patch->enabled = false; + patch->forced = false; + INIT_WORK(&patch->free_work, klp_free_patch_work_fn); init_completion(&patch->finish); - ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, "%s", patch->mod->name); - if (ret) { - mutex_unlock(&klp_mutex); + klp_for_each_object_static(patch, obj) { + if (!obj->funcs) + return -EINVAL; + + klp_init_object_early(patch, obj); + + klp_for_each_func_static(obj, func) { + klp_init_func_early(obj, func); + } + } + + if (!try_module_get(patch->mod)) + return -ENODEV; + + return 0; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + ret = kobject_add(&patch->kobj, klp_root_kobj, 
"%s", patch->mod->name); + if (ret) return ret; + + if (patch->replace) { + ret = klp_add_nops(patch); + if (ret) + return ret; } klp_for_each_object(patch, obj) { ret = klp_init_object(patch, obj); if (ret) - goto free; + return ret; } list_add_tail(&patch->list, &klp_patches); - mutex_unlock(&klp_mutex); - return 0; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; -free: - klp_free_objects_limited(patch, obj); + if (WARN_ON(!patch->enabled)) + return -EINVAL; - mutex_unlock(&klp_mutex); + if (klp_transition_patch) + return -EBUSY; - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + klp_init_transition(patch, KLP_UNPATCHED); - return ret; + klp_for_each_object(patch, obj) + if (obj->patched) + klp_pre_unpatch_callback(obj); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); + + klp_start_transition(); + patch->enabled = false; + klp_try_complete_transition(); + + return 0; } -/** - * klp_unregister_patch() - unregisters a patch - * @patch: Disabled patch to be unregistered - * - * Frees the data structures and removes the sysfs interface. - * - * Return: 0 on success, otherwise error - */ -int klp_unregister_patch(struct klp_patch *patch) +static int __klp_enable_patch(struct klp_patch *patch) { + struct klp_object *obj; int ret; - mutex_lock(&klp_mutex); + if (klp_transition_patch) + return -EBUSY; - if (!klp_is_patch_registered(patch)) { - ret = -EINVAL; - goto err; - } + if (WARN_ON(patch->enabled)) + return -EINVAL; - if (patch->enabled) { - ret = -EBUSY; - goto err; - } + pr_notice("enabling patch '%s'\n", patch->mod->name); - klp_free_patch(patch); + klp_init_transition(patch, KLP_PATCHED); - mutex_unlock(&klp_mutex); + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. + */ + smp_wmb(); - kobject_put(&patch->kobj); - wait_for_completion(&patch->finish); + klp_for_each_object(patch, obj) { + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_pre_patch_callback(obj); + if (ret) { + pr_warn("pre-patch callback failed for object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to patch object '%s'\n", + klp_is_module(obj) ? obj->name : "vmlinux"); + goto err; + } + } + + klp_start_transition(); + patch->enabled = true; + klp_try_complete_transition(); return 0; err: - mutex_unlock(&klp_mutex); + pr_warn("failed to enable patch '%s'\n", patch->mod->name); + + klp_cancel_transition(); return ret; } -EXPORT_SYMBOL_GPL(klp_unregister_patch); /** - * klp_register_patch() - registers a patch - * @patch: Patch to be registered + * klp_enable_patch() - enable the livepatch + * @patch: patch to be enabled * - * Initializes the data structure associated with the patch and - * creates the sysfs interface. + * Initializes the data structure associated with the patch, creates the sysfs + * interface, performs the needed symbol lookups and code relocations, + * registers the patched functions with ftrace. 
* - * There is no need to take the reference on the patch module here. It is done - * later when the patch is enabled. + * This function is supposed to be called from the livepatch module_init() + * callback. * * Return: 0 on success, otherwise error */ -int klp_register_patch(struct klp_patch *patch) +int klp_enable_patch(struct klp_patch *patch) { + int ret; + if (!patch || !patch->mod) return -EINVAL; @@ -896,13 +986,91 @@ int klp_register_patch(struct klp_patch *patch) return -ENODEV; if (!klp_have_reliable_stack()) { - pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); - return -ENOSYS; + pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); + pr_warn("The livepatch transition may never complete.\n"); + } + + mutex_lock(&klp_mutex); + + ret = klp_init_patch_early(patch); + if (ret) { + mutex_unlock(&klp_mutex); + return ret; + } + + ret = klp_init_patch(patch); + if (ret) + goto err; + + ret = __klp_enable_patch(patch); + if (ret) + goto err; + + mutex_unlock(&klp_mutex); + + return 0; + +err: + klp_free_patch_start(patch); + + mutex_unlock(&klp_mutex); + + klp_free_patch_finish(patch); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * This function removes replaced patches. + * + * We could be pretty aggressive here. It is called in the situation where + * these structures are no longer accessible. All functions are redirected + * by the klp_transition_patch. They use either a new code or they are in + * the original code because of the special nop function patches. + * + * The only exception is when the transition was forced. In this case, + * klp_ftrace_handler() might still see the replaced patch on the stack. + * Fortunately, it is carefully designed to work with removed functions + * thanks to RCU. We only have to keep the patches on the system. Also + * this is handled transparently by patch->module_put. + */ +void klp_discard_replaced_patches(struct klp_patch *new_patch) +{ + struct klp_patch *old_patch, *tmp_patch; + + klp_for_each_patch_safe(old_patch, tmp_patch) { + if (old_patch == new_patch) + return; + + old_patch->enabled = false; + klp_unpatch_objects(old_patch); + klp_free_patch_start(old_patch); + schedule_work(&old_patch->free_work); } +} - return klp_init_patch(patch); +/* + * This function removes the dynamically allocated 'nop' functions. + * + * We could be pretty aggressive. NOPs do not change the existing + * behavior except for adding unnecessary delay by the ftrace handler. + * + * It is safe even when the transition was forced. The ftrace handler + * will see a valid ops->func_stack entry thanks to RCU. + * + * We could even free the NOPs structures. They must be the last entry + * in ops->func_stack. Therefore unregister_ftrace_function() is called. + * It does the same as klp_synchronize_transition() to make sure that + * nobody is inside the ftrace handler once the operation finishes. + * + * IMPORTANT: It must be called right after removing the replaced patches! + */ +void klp_discard_nops(struct klp_patch *new_patch) +{ + klp_unpatch_objects_dynamic(klp_transition_patch); + klp_free_objects_dynamic(klp_transition_patch); } -EXPORT_SYMBOL_GPL(klp_register_patch); /* * Remove parts of patches that touch a given kernel module. 
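Because klp_register_patch() and klp_unregister_patch() no longer exist, the module wrapping shrinks to a single klp_enable_patch() call from module_init(); the module can only be removed after the patch has been disabled through sysfs and its free_work has released the reference. Continuing the sketch above (patch is the struct klp_patch declared there):

#include <linux/livepatch.h>
#include <linux/module.h>

static int __init lp_init(void)
{
	/*
	 * One call now creates /sys/kernel/livepatch/<module>/,
	 * resolves the old symbols and starts the patching transition.
	 */
	return klp_enable_patch(&patch);
}

static void __exit lp_exit(void)
{
	/* Nothing to undo here; the disable path runs via sysfs. */
}

module_init(lp_init);
module_exit(lp_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");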
The list of @@ -915,7 +1083,7 @@ static void klp_cleanup_module_patches_limited(struct module *mod, struct klp_patch *patch; struct klp_object *obj; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { if (patch == limit) break; @@ -923,21 +1091,14 @@ static void klp_cleanup_module_patches_limited(struct module *mod, if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - /* - * Only unpatch the module if the patch is enabled or - * is in transition. - */ - if (patch->enabled || patch == klp_transition_patch) { - - if (patch != klp_transition_patch) - klp_pre_unpatch_callback(obj); + if (patch != klp_transition_patch) + klp_pre_unpatch_callback(obj); - pr_notice("reverting patch '%s' on unloading module '%s'\n", - patch->mod->name, obj->mod->name); - klp_unpatch_object(obj); + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_unpatch_object(obj); - klp_post_unpatch_callback(obj); - } + klp_post_unpatch_callback(obj); klp_free_object_loaded(obj); break; @@ -962,7 +1123,7 @@ int klp_module_coming(struct module *mod) */ mod->klp_alive = true; - list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_patch(patch) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; @@ -976,13 +1137,6 @@ int klp_module_coming(struct module *mod) goto err; } - /* - * Only patch the module if the patch is enabled or is - * in transition. - */ - if (!patch->enabled && patch != klp_transition_patch) - break; - pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); @@ -1048,14 +1202,6 @@ void klp_module_going(struct module *mod) static int __init klp_init(void) { - int ret; - - ret = klp_check_compiler_support(); - if (ret) { - pr_info("Your compiler is too old; turning off.\n"); - return -EINVAL; - } - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); if (!klp_root_kobj) return -ENOMEM; diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h index 48a83d4364cf..ec43a40b853f 100644 --- a/kernel/livepatch/core.h +++ b/kernel/livepatch/core.h @@ -5,6 +5,17 @@ #include <linux/livepatch.h> extern struct mutex klp_mutex; +extern struct list_head klp_patches; + +#define klp_for_each_patch_safe(patch, tmp_patch) \ + list_for_each_entry_safe(patch, tmp_patch, &klp_patches, list) + +#define klp_for_each_patch(patch) \ + list_for_each_entry(patch, &klp_patches, list) + +void klp_free_patch_start(struct klp_patch *patch); +void klp_discard_replaced_patches(struct klp_patch *new_patch); +void klp_discard_nops(struct klp_patch *new_patch); static inline bool klp_is_object_loaded(struct klp_object *obj) { diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 7702cb4064fc..bd43537702bd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * patch.c - livepatch patching functions * * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> * Copyright (C) 2014 SUSE * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -34,7 +22,7 @@ static LIST_HEAD(klp_ops); -struct klp_ops *klp_find_ops(unsigned long old_addr) +struct klp_ops *klp_find_ops(void *old_func) { struct klp_ops *ops; struct klp_func *func; @@ -42,7 +30,7 @@ struct klp_ops *klp_find_ops(unsigned long old_addr) list_for_each_entry(ops, &klp_ops, node) { func = list_first_entry(&ops->func_stack, struct klp_func, stack_node); - if (func->old_addr == old_addr) + if (func->old_func == old_func) return ops; } @@ -118,7 +106,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, } } + /* + * NOPs are used to replace existing patches with original code. + * Do nothing! Setting pc would cause an infinite loop. + */ + if (func->nop) + goto unlock; + klp_arch_set_pc(regs, (unsigned long)func->new_func); + unlock: preempt_enable_notrace(); } @@ -142,17 +138,18 @@ static void klp_unpatch_func(struct klp_func *func) if (WARN_ON(!func->patched)) return; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (WARN_ON(!ops)) return; if (list_is_singular(&ops->func_stack)) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (WARN_ON(!ftrace_loc)) return; @@ -174,17 +171,18 @@ static int klp_patch_func(struct klp_func *func) struct klp_ops *ops; int ret; - if (WARN_ON(!func->old_addr)) + if (WARN_ON(!func->old_func)) return -EINVAL; if (WARN_ON(func->patched)) return -EINVAL; - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (!ops) { unsigned long ftrace_loc; - ftrace_loc = klp_get_ftrace_location(func->old_addr); + ftrace_loc = + klp_get_ftrace_location((unsigned long)func->old_func); if (!ftrace_loc) { pr_err("failed to find location for function '%s'\n", func->old_name); @@ -236,15 +234,26 @@ err: return ret; } -void klp_unpatch_object(struct klp_object *obj) +static void __klp_unpatch_object(struct klp_object *obj, bool nops_only) { struct klp_func *func; - klp_for_each_func(obj, func) + klp_for_each_func(obj, func) { + if (nops_only && !func->nop) + continue; + if (func->patched) klp_unpatch_func(func); + } + + if (obj->dynamic || !nops_only) + obj->patched = false; +} + - obj->patched = false; +void klp_unpatch_object(struct klp_object *obj) +{ + __klp_unpatch_object(obj, false); } int klp_patch_object(struct klp_object *obj) @@ -267,11 +276,21 @@ int klp_patch_object(struct klp_object *obj) return 0; } -void klp_unpatch_objects(struct klp_patch *patch) +static void __klp_unpatch_objects(struct klp_patch *patch, bool nops_only) { struct klp_object *obj; klp_for_each_object(patch, obj) if (obj->patched) - klp_unpatch_object(obj); + __klp_unpatch_object(obj, nops_only); +} + +void klp_unpatch_objects(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, false); +} + +void klp_unpatch_objects_dynamic(struct klp_patch *patch) +{ + __klp_unpatch_objects(patch, true); } diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h index e72d8250d04b..d5f2fbe373e0 100644 --- 
a/kernel/livepatch/patch.h +++ b/kernel/livepatch/patch.h @@ -10,7 +10,7 @@ * struct klp_ops - structure for tracking registered ftrace ops structs * * A single ftrace_ops is shared between all enabled replacement functions - * (klp_func structs) which have the same old_addr. This allows the switch + * (klp_func structs) which have the same old_func. This allows the switch * between function versions to happen instantaneously by updating the klp_ops * struct's func_stack list. The winner is the klp_func at the top of the * func_stack (front of the list). @@ -25,10 +25,11 @@ struct klp_ops { struct ftrace_ops fops; }; -struct klp_ops *klp_find_ops(unsigned long old_addr); +struct klp_ops *klp_find_ops(void *old_func); int klp_patch_object(struct klp_object *obj); void klp_unpatch_object(struct klp_object *obj); void klp_unpatch_objects(struct klp_patch *patch); +void klp_unpatch_objects_dynamic(struct klp_patch *patch); #endif /* _LIVEPATCH_PATCH_H */ diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 83958c814439..e5c9fb295ba9 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -1,22 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * shadow.c - Shadow Variables * * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com> * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ /** diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 304d5eb8a98c..cdf318d86dd6 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * transition.c - Kernel Live Patching transition functions * * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -29,11 +17,13 @@ #define MAX_STACK_ENTRIES 100 #define STACK_ERR_BUF_SIZE 128 +#define SIGNALS_TIMEOUT 15 + struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; -static bool klp_forced = false; +static unsigned int klp_signals_cnt; /* * This work can be performed periodically to finish patching or unpatching any @@ -87,6 +77,11 @@ static void klp_complete_transition(void) klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) { + klp_discard_replaced_patches(klp_transition_patch); + klp_discard_nops(klp_transition_patch); + } + if (klp_target_state == KLP_UNPATCHED) { /* * All tasks have transitioned to KLP_UNPATCHED so we can now @@ -136,13 +131,6 @@ static void klp_complete_transition(void) pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * klp_forced set implies unbounded increase of module's ref count if - * the module is disabled/enabled in a loop. - */ - if (!klp_forced && klp_target_state == KLP_UNPATCHED) - module_put(klp_transition_patch->mod); - klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; } @@ -202,15 +190,15 @@ void klp_update_patch_state(struct task_struct *task) * Determine whether the given stack trace includes any references to a * to-be-patched or to-be-unpatched function. */ -static int klp_check_stack_func(struct klp_func *func, - struct stack_trace *trace) +static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, + unsigned int nr_entries) { unsigned long func_addr, func_size, address; struct klp_ops *ops; int i; - for (i = 0; i < trace->nr_entries; i++) { - address = trace->entries[i]; + for (i = 0; i < nr_entries; i++) { + address = entries[i]; if (klp_target_state == KLP_UNPATCHED) { /* @@ -224,11 +212,11 @@ static int klp_check_stack_func(struct klp_func *func, * Check for the to-be-patched function * (the previous func). 
*/ - ops = klp_find_ops(func->old_addr); + ops = klp_find_ops(func->old_func); if (list_is_singular(&ops->func_stack)) { /* original function */ - func_addr = func->old_addr; + func_addr = (unsigned long)func->old_func; func_size = func->old_size; } else { /* previously patched function */ @@ -254,29 +242,24 @@ static int klp_check_stack_func(struct klp_func *func, static int klp_check_stack(struct task_struct *task, char *err_buf) { static unsigned long entries[MAX_STACK_ENTRIES]; - struct stack_trace trace; struct klp_object *obj; struct klp_func *func; - int ret; + int ret, nr_entries; - trace.skip = 0; - trace.nr_entries = 0; - trace.max_entries = MAX_STACK_ENTRIES; - trace.entries = entries; - ret = save_stack_trace_tsk_reliable(task, &trace); - WARN_ON_ONCE(ret == -ENOSYS); - if (ret) { + ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); + if (ret < 0) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d has an unreliable stack\n", __func__, task->comm, task->pid); return ret; } + nr_entries = ret; klp_for_each_object(klp_transition_patch, obj) { if (!obj->patched) continue; klp_for_each_func(obj, func) { - ret = klp_check_stack_func(func, &trace); + ret = klp_check_stack_func(func, entries, nr_entries); if (ret) { snprintf(err_buf, STACK_ERR_BUF_SIZE, "%s: %s:%d is sleeping on function %s\n", @@ -297,11 +280,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) */ static bool klp_try_switch_task(struct task_struct *task) { + static char err_buf[STACK_ERR_BUF_SIZE]; struct rq *rq; struct rq_flags flags; int ret; bool success = false; - char err_buf[STACK_ERR_BUF_SIZE]; err_buf[0] = '\0'; @@ -310,6 +293,13 @@ static bool klp_try_switch_task(struct task_struct *task) return true; /* + * For arches which don't have reliable stack traces, we have to rely + * on other methods (e.g., switching tasks at kernel exit). + */ + if (!klp_have_reliable_stack()) + return false; + + /* * Now try to check the stack for any to-be-patched or to-be-unpatched * functions. If all goes well, switch the task to the target patch * state. @@ -344,7 +334,47 @@ done: pr_debug("%s", err_buf); return success; +} + +/* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. + */ +static void klp_send_signals(void) +{ + struct task_struct *g, *task; + + if (klp_signals_cnt == SIGNALS_TIMEOUT) + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptedly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated. 
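The switch to stack_trace_save_tsk_reliable() also simplifies the calling convention: the helper returns the number of saved entries or a negative errno, so the old -ENOSYS special case and the struct stack_trace plumbing disappear. A minimal sketch of a caller, assuming an architecture that selects HAVE_RELIABLE_STACKTRACE (the function name is made up):

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

#define SKETCH_MAX_ENTRIES 100

static int sketch_task_stack_is_clean(struct task_struct *task)
{
	/* Static to keep it off the stack; the caller must serialize. */
	static unsigned long entries[SKETCH_MAX_ENTRIES];
	int nr;

	nr = stack_trace_save_tsk_reliable(task, entries,
					   ARRAY_SIZE(entries));
	if (nr < 0)
		return nr;	/* the stack could not be walked reliably */

	/* entries[0..nr-1] are valid; scan them for patched functions. */
	return 0;
}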
+ */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); } /* @@ -359,6 +389,7 @@ void klp_try_complete_transition(void) { unsigned int cpu; struct task_struct *g, *task; + struct klp_patch *patch; bool complete = true; WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); @@ -396,6 +427,10 @@ void klp_try_complete_transition(void) put_online_cpus(); if (!complete) { + if (klp_signals_cnt && !(klp_signals_cnt % SIGNALS_TIMEOUT)) + klp_send_signals(); + klp_signals_cnt++; + /* * Some tasks weren't able to be switched over. Try again * later and/or wait for other methods like kernel exit @@ -407,7 +442,18 @@ void klp_try_complete_transition(void) } /* we're done, now cleanup the data structures */ + patch = klp_transition_patch; klp_complete_transition(); + + /* + * It would make more sense to free the patch in + * klp_complete_transition() but it is called also + * from klp_cancel_transition(). + */ + if (!patch->enabled) { + klp_free_patch_start(patch); + schedule_work(&patch->free_work); + } } /* @@ -446,6 +492,8 @@ void klp_start_transition(void) if (task->patch_state != klp_target_state) set_tsk_thread_flag(task, TIF_PATCH_PENDING); } + + klp_signals_cnt = 0; } /* @@ -569,47 +617,6 @@ void klp_copy_process(struct task_struct *child) } /* - * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. - * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this - * action currently. - */ -void klp_send_signals(void) -{ - struct task_struct *g, *task; - - pr_notice("signaling remaining tasks\n"); - - read_lock(&tasklist_lock); - for_each_process_thread(g, task) { - if (!klp_patch_pending(task)) - continue; - - /* - * There is a small race here. We could see TIF_PATCH_PENDING - * set and decide to wake up a kthread or send a fake signal. - * Meanwhile the task could migrate itself and the action - * would be meaningless. It is not serious though. - */ - if (task->flags & PF_KTHREAD) { - /* - * Wake up a kthread which sleeps interruptedly and - * still has not been migrated. - */ - wake_up_state(task, TASK_INTERRUPTIBLE); - } else { - /* - * Send fake signal to all non-kthread tasks which are - * still not migrated. - */ - spin_lock_irq(&task->sighand->siglock); - signal_wake_up(task, 0); - spin_unlock_irq(&task->sighand->siglock); - } - } - read_unlock(&tasklist_lock); -} - -/* * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an * existing transition to finish. 
* @@ -620,6 +627,7 @@ void klp_send_signals(void) */ void klp_force_transition(void) { + struct klp_patch *patch; struct task_struct *g, *task; unsigned int cpu; @@ -633,5 +641,6 @@ void klp_force_transition(void) for_each_possible_cpu(cpu) klp_update_patch_state(idle_task(cpu)); - klp_forced = true; + klp_for_each_patch(patch) + patch->forced = true; } diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index f9d0bc016067..322db16233de 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,7 +11,6 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); -void klp_send_signals(void); void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..45452facff3b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..fa2c2f951c6b --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * Collect locking event counts + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/fs.h> + +#include "lock_events.h" + +#undef LOCK_EVENT +#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR "lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the <debugfs>/lock_event_counts/ + * directory. See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + + [LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. 
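The lockevent_names[] table works by including lock_events_list.h a second time with a different definition of LOCK_EVENT, so the enum and the string table are generated from one list. The same X-macro idea in a self-contained form, with an inline list instead of a separate header (all names here are made up):

/* One list, expanded twice. */
#define SKETCH_EVENT_LIST	\
	X(alpha)		\
	X(beta)

#define X(name) EV_##name,
enum sketch_events { SKETCH_EVENT_LIST EV_NUM };
#undef X

#define X(name) [EV_##name] = #name,
static const char * const sketch_event_names[EV_NUM] = { SKETCH_EVENT_LIST };
#undef X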
+ */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, id, len; + u64 sum = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + id = (long)file_inode(file)->i_private; + + if (id >= lockevent_num) + return -EBADF; + + for_each_possible_cpu(cpu) + sum += per_cpu(lockevents[id], cpu); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + + for (i = 0 ; i < lockevent_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { + .read = lockevent_read, + .write = lockevent_write, + .llseek = default_llseek, +}; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include <asm/paravirt.h> + +static bool __init skip_lockevent(const char *name) +{ + static int pv_on __initdata = -1; + + if (pv_on < 0) + pv_on = !pv_is_native_spin_unlock(); + /* + * Skip PV qspinlock events on bare metal. + */ + if (!pv_on && !memcmp(name, "pv_", 3)) + return true; + return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ + return false; +} +#endif + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ + struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); + int i; + + if (!d_counts) + goto out; + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < lockevent_num; i++) { + if (skip_lockevent(lockevent_names[i])) + continue; + if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, + (void *)(long)i, &fops_lockevent)) + goto fail_undo; + } + + if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, + d_counts, (void *)(long)LOCKEVENT_reset_cnts, + &fops_lockevent)) + goto fail_undo; + + return 0; +fail_undo: + debugfs_remove_recursive(d_counts); +out: + pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); + return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..8c7e7d25f09c --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
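From userspace the counters are plain debugfs files; both the per-event files and .reset_counts are root-only by design. A hedged sketch of reading one counter and then clearing everything, assuming the usual debugfs mount point and an event name from the list further below:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define LE_DIR "/sys/kernel/debug/lock_event_counts/"

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	fd = open(LE_DIR "rwsem_wake_reader", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("rwsem_wake_reader: %s", buf);
		}
		close(fd);
	}

	/* Any write to ".reset_counts" zeroes every counter. */
	fd = open(LE_DIR ".reset_counts", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "1", 1) < 0)
			perror("reset");
		close(fd);
	}
	return 0;
}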
+ * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + +enum lock_events { + +#include "lock_events_list.h" + + lockevent_num, /* Total number of lock event counts */ + LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_LOCK_EVENT_COUNTS +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the statistical counters. use raw_cpu_inc() because of lower + * overhead and we don't care if we loose the occasional update. + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ + if (cond) + raw_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ + raw_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) + +#else /* CONFIG_LOCK_EVENT_COUNTS */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..239039d0ce21 --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name) LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. + */ +LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */ +LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */ +LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */ +LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */ +LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */ +LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */ +LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */ +LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */ +LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. 
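Inside the kernel these macros are meant to be dropped into locking slowpaths; with CONFIG_LOCK_EVENT_COUNTS disabled they compile away to nothing. A sketch of the typical call sites (the surrounding slowpath is imaginary, the event names come from the list that follows):

#include <linux/types.h>

#include "lock_events.h"

/* Imaginary rwsem-style slowpath fragment. */
static void sketch_rwsem_slowpath(bool is_reader, int failed_spins)
{
	/* Unconditional count. */
	lockevent_inc(rwsem_sleep_writer);

	/* Count only when the condition holds. */
	lockevent_cond_inc(rwsem_sleep_reader, is_reader);

	/* Add an arbitrary amount in one go. */
	lockevent_add(rwsem_opt_fail, failed_spins);
}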
+ */ +LOCK_EVENT(lock_pending) /* # of locking ops via pending code */ +LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */ +LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ +LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ +LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ +LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ +LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ +LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ +LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ +LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ +LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ +LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ +LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ +LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ +LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ +LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ +LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 95932333a48b..341f52117f88 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/lockdep.c * @@ -45,11 +46,14 @@ #include <linux/hash.h> #include <linux/ftrace.h> #include <linux/stringify.h> +#include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> #include <linux/nmi.h> +#include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <asm/sections.h> @@ -81,6 +85,7 @@ module_param(lock_stat, int, 0644); * code to recurse back into the lockdep code... */ static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static struct task_struct *lockdep_selftest_task_struct; static int graph_lock(void) { @@ -130,29 +135,44 @@ static inline int debug_locks_off_graph_unlock(void) unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; +static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES); /* * All data structures here are protected by the global debug_lock. * - * Mutex key structs only get allocated, once during bootup, and never - * get freed - this significantly simplifies the debugging code. + * nr_lock_classes is the number of elements of lock_classes[] that is + * in use. 
*/ +#define KEYHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define KEYHASH_SIZE (1UL << KEYHASH_BITS) +static struct hlist_head lock_keys_hash[KEYHASH_SIZE]; unsigned long nr_lock_classes; #ifndef CONFIG_DEBUG_LOCKDEP static #endif struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); static inline struct lock_class *hlock_class(struct held_lock *hlock) { - if (!hlock->class_idx) { + unsigned int class_idx = hlock->class_idx; + + /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ + barrier(); + + if (!test_bit(class_idx, lock_classes_in_use)) { /* * Someone passed in garbage, we give up. */ DEBUG_LOCKS_WARN_ON(1); return NULL; } - return lock_classes + hlock->class_idx - 1; + + /* + * At this point, if the passed hlock->class_idx is still garbage, + * we just have to live with it + */ + return lock_classes + class_idx; } #ifdef CONFIG_LOCK_STAT @@ -277,11 +297,42 @@ static inline void lock_release_holdtime(struct held_lock *hlock) #endif /* - * We keep a global list of all lock classes. The list only grows, - * never shrinks. The list is only accessed with the lockdep - * spinlock lock held. + * We keep a global list of all lock classes. The list is only accessed with + * the lockdep spinlock lock held. free_lock_classes is a list with free + * elements. These elements are linked together by the lock_entry member in + * struct lock_class. */ LIST_HEAD(all_lock_classes); +static LIST_HEAD(free_lock_classes); + +/** + * struct pending_free - information about data structures about to be freed + * @zapped: Head of a list with struct lock_class elements. + * @lock_chains_being_freed: Bitmap that indicates which lock_chains[] elements + * are about to be freed. + */ +struct pending_free { + struct list_head zapped; + DECLARE_BITMAP(lock_chains_being_freed, MAX_LOCKDEP_CHAINS); +}; + +/** + * struct delayed_free - data structures used for delayed freeing + * + * A data structure for delayed freeing of data structures that may be + * accessed by RCU readers at the time these were freed. + * + * @rcu_head: Used to schedule an RCU callback for freeing data structures. + * @index: Index of @pf to which freed data structures are added. + * @scheduled: Whether or not an RCU callback has been scheduled. + * @pf: Array with information about data structures about to be freed. + */ +static struct delayed_free { + struct rcu_head rcu_head; + int index; + int scheduled; + struct pending_free pf[2]; +} delayed_free; /* * The lockdep classes are in a hash-table as well, for fast lookup: @@ -319,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) return k0 | (u64)k1 << 32; } +void lockdep_init_task(struct task_struct *task) +{ + task->lockdep_depth = 0; /* no locks held yet */ + task->curr_chain_key = INITIAL_CHAIN_KEY; + task->lockdep_recursion = 0; +} + void lockdep_off(void) { current->lockdep_recursion++; @@ -331,6 +389,11 @@ void lockdep_on(void) } EXPORT_SYMBOL(lockdep_on); +void lockdep_set_selftest_task(struct task_struct *task) +{ + lockdep_selftest_task_struct = task; +} + /* * Debugging switches: */ @@ -374,13 +437,6 @@ static int verbose(struct lock_class *class) return 0; } -/* - * Stack-trace: tightly packed array of stack backtrace - * addresses. Protected by the graph_lock. 
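The pending_free/delayed_free pair introduced above is a double-buffered RCU batch: zapped entries are collected in pf[index] while the other slot waits out a grace period, and the single scheduled flag keeps the one rcu_head from being reused while a callback is still in flight. The same pattern in a stripped-down, generic form (struct victim, dr_queue() and the locking choices are illustrative, not lockdep's actual code):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct victim {
	struct list_head node;
};

/*
 * Two collection slots: one fills up while the other waits out a grace
 * period.  'scheduled' ensures the single rcu_head is never reused while
 * a callback is in flight.
 */
static struct delayed_reclaim {
	struct rcu_head rcu_head;
	int index;
	bool scheduled;
	struct list_head pf[2];
} dr = {
	.pf = { LIST_HEAD_INIT(dr.pf[0]), LIST_HEAD_INIT(dr.pf[1]) },
};

static DEFINE_SPINLOCK(dr_lock);

static void dr_free_cb(struct rcu_head *rh)
{
	struct victim *v, *tmp;

	spin_lock(&dr_lock);
	/* Drain the slot that stopped collecting when we scheduled. */
	list_for_each_entry_safe(v, tmp, &dr.pf[dr.index ^ 1], node) {
		list_del(&v->node);
		kfree(v);
	}
	dr.scheduled = false;
	spin_unlock(&dr_lock);
}

/* Queue an object; it is freed only after current RCU readers are done. */
static void dr_queue(struct victim *v)
{
	spin_lock_bh(&dr_lock);
	list_add_tail(&v->node, &dr.pf[dr.index]);
	if (!dr.scheduled) {
		dr.scheduled = true;
		dr.index ^= 1;		/* new arrivals use the other slot */
		call_rcu(&dr.rcu_head, dr_free_cb);
	}
	spin_unlock_bh(&dr_lock);
}

Entries queued while a callback is already pending simply wait for a later dr_queue() call to schedule another round, which mirrors how lockdep batches its zapped classes.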
- */ -unsigned long nr_stack_trace_entries; -static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - static void print_lockdep_off(const char *bug_msg) { printk(KERN_DEBUG "%s\n", bug_msg); @@ -390,29 +446,23 @@ static void print_lockdep_off(const char *bug_msg) #endif } -static int save_trace(struct stack_trace *trace) -{ - trace->nr_entries = 0; - trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - trace->entries = stack_trace + nr_stack_trace_entries; - - trace->skip = 3; - - save_stack_trace(trace); +unsigned long nr_stack_trace_entries; - /* - * Some daft arches put -1 at the end to indicate its a full trace. - * - * <rant> this is buggy anyway, since it takes a whole extra entry so a - * complete trace that maxes out the entries provided will be reported - * as incomplete, friggin useless </rant> - */ - if (trace->nr_entries != 0 && - trace->entries[trace->nr_entries-1] == ULONG_MAX) - trace->nr_entries--; +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the graph_lock. + */ +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; - trace->max_entries = trace->nr_entries; +static int save_trace(struct lock_trace *trace) +{ + unsigned long *entries = stack_trace + nr_stack_trace_entries; + unsigned int max_entries; + trace->offset = nr_stack_trace_entries; + max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->nr_entries = stack_trace_save(entries, max_entries, 3); nr_stack_trace_entries += trace->nr_entries; if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -427,6 +477,7 @@ static int save_trace(struct stack_trace *trace) return 1; } +#endif unsigned int nr_hardirq_chains; unsigned int nr_softirq_chains; @@ -440,6 +491,7 @@ unsigned int max_lockdep_depth; DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Locking printouts: */ @@ -457,6 +509,7 @@ static const char *usage_str[] = #undef LOCKDEP_STATE [LOCK_USED] = "INITIAL USE", }; +#endif const char * __get_key_name(struct lockdep_subclass_key *key, char *str) { @@ -470,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit) static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) { + /* + * The usage character defaults to '.' (i.e., irqs disabled and not in + * irq context), which is the safest usage category. + */ char c = '.'; - if (class->usage_mask & lock_flag(bit + 2)) + /* + * The order of the following usage checks matters, which will + * result in the outcome character as follows: + * + * - '+': irq is enabled and not in irq context + * - '-': in irq context and irq is disabled + * - '?': in irq context and irq is enabled + */ + if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) { c = '+'; - if (class->usage_mask & lock_flag(bit)) { - c = '-'; - if (class->usage_mask & lock_flag(bit + 2)) + if (class->usage_mask & lock_flag(bit)) c = '?'; - } + } else if (class->usage_mask & lock_flag(bit)) + c = '-'; return c; } @@ -542,19 +606,22 @@ static void print_lock(struct held_lock *hlock) /* * We can be called locklessly through debug_show_all_locks() so be * extra careful, the hlock might have been released and cleared. + * + * If this indeed happens, lets pretend it does not hurt to continue + * to print the lock unless the hlock class_idx does not point to a + * registered class. 
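[Editorial aside, not part of the patch] The rewritten get_usage_char() earlier in this hunk derives one of four characters from two usage bits per irq state: the USED_IN bit and its ENABLED counterpart. A standalone sketch of that mapping, with simplified flag names of my own rather than lockdep's:

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>

#define USED_IN_IRQ	0x1	/* lock was taken in irq context      */
#define ENABLED_IRQ	0x2	/* lock was taken with irqs enabled   */

/*
 * '.'  neither bit set (safest case)
 * '+'  irqs on, never taken in irq context
 * '-'  taken in irq context, irqs always off
 * '?'  both: taken in irq context *and* with irqs enabled
 */
static char usage_char(unsigned int mask)
{
	char c = '.';

	if (mask & ENABLED_IRQ) {
		c = '+';
		if (mask & USED_IN_IRQ)
			c = '?';
	} else if (mask & USED_IN_IRQ) {
		c = '-';
	}
	return c;
}

int main(void)
{
	for (unsigned int m = 0; m < 4; m++)
		printf("mask %u -> '%c'\n", m, usage_char(m));
	return 0;
}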
The rationale here is: since we don't attempt + * to distinguish whether we are in this situation, if it just + * happened we can't count on class_idx to tell either. */ - unsigned int class_idx = hlock->class_idx; - - /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ - barrier(); + struct lock_class *lock = hlock_class(hlock); - if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { + if (!lock) { printk(KERN_CONT "<RELEASED>\n"); return; } printk(KERN_CONT "%p", hlock->instance); - print_lock_name(lock_classes + class_idx - 1); + print_lock_name(lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -599,12 +666,15 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ -static int static_obj(void *obj) +static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext, end = (unsigned long) &_end, addr = (unsigned long) obj; + if (arch_is_kernel_initmem_freed(addr)) + return 0; + /* * static variable? */ @@ -699,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name); + WARN_ON_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__); return class; } } @@ -716,6 +787,17 @@ static bool assign_lock_key(struct lockdep_map *lock) { unsigned long can_addr, addr = (unsigned long)lock; +#ifdef __KERNEL__ + /* + * lockdep_free_key_range() assumes that struct lock_class_key + * objects do not overlap. Since we use the address of lock + * objects as class key for static objects, check whether the + * size of lock_class_key objects does not exceed the size of + * the smallest lock object. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > sizeof(raw_spinlock_t)); +#endif + if (__is_kernel_percpu_address(addr, &can_addr)) lock->key = (void *)can_addr; else if (__is_module_percpu_address(addr, &can_addr)) @@ -735,6 +817,289 @@ static bool assign_lock_key(struct lockdep_map *lock) return true; } +#ifdef CONFIG_DEBUG_LOCKDEP + +/* Check whether element @e occurs in list @h */ +static bool in_list(struct list_head *e, struct list_head *h) +{ + struct list_head *f; + + list_for_each(f, h) { + if (e == f) + return true; + } + + return false; +} + +/* + * Check whether entry @e occurs in any of the locks_after or locks_before + * lists. + */ +static bool in_any_class_list(struct list_head *e) +{ + struct lock_class *class; + int i; + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (in_list(e, &class->locks_after) || + in_list(e, &class->locks_before)) + return true; + } + return false; +} + +static bool class_lock_list_valid(struct lock_class *c, struct list_head *h) +{ + struct lock_list *e; + + list_for_each_entry(e, h, entry) { + if (e->links_to != c) { + printk(KERN_INFO "class %s: mismatch for lock entry %ld; class %s <> %s", + c->name ? : "(?)", + (unsigned long)(e - list_entries), + e->links_to && e->links_to->name ? + e->links_to->name : "(?)", + e->class && e->class->name ? 
e->class->name : + "(?)"); + return false; + } + } + return true; +} + +#ifdef CONFIG_PROVE_LOCKING +static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; +#endif + +static bool check_lock_chain_key(struct lock_chain *chain) +{ +#ifdef CONFIG_PROVE_LOCKING + u64 chain_key = INITIAL_CHAIN_KEY; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); + /* + * The 'unsigned long long' casts avoid that a compiler warning + * is reported when building tools/lib/lockdep. + */ + if (chain->chain_key != chain_key) { + printk(KERN_INFO "chain %lld: key %#llx <> %#llx\n", + (unsigned long long)(chain - lock_chains), + (unsigned long long)chain->chain_key, + (unsigned long long)chain_key); + return false; + } +#endif + return true; +} + +static bool in_any_zapped_class_list(struct lock_class *class) +{ + struct pending_free *pf; + int i; + + for (i = 0, pf = delayed_free.pf; i < ARRAY_SIZE(delayed_free.pf); i++, pf++) { + if (in_list(&class->lock_entry, &pf->zapped)) + return true; + } + + return false; +} + +static bool __check_data_structures(void) +{ + struct lock_class *class; + struct lock_chain *chain; + struct hlist_head *head; + struct lock_list *e; + int i; + + /* Check whether all classes occur in a lock list. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!in_list(&class->lock_entry, &all_lock_classes) && + !in_list(&class->lock_entry, &free_lock_classes) && + !in_any_zapped_class_list(class)) { + printk(KERN_INFO "class %px/%s is not in any class list\n", + class, class->name ? : "(?)"); + return false; + } + } + + /* Check whether all classes have valid lock lists. */ + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + class = &lock_classes[i]; + if (!class_lock_list_valid(class, &class->locks_before)) + return false; + if (!class_lock_list_valid(class, &class->locks_after)) + return false; + } + + /* Check the chain_key of all lock chains. */ + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + if (!check_lock_chain_key(chain)) + return false; + } + } + + /* + * Check whether all list entries that are in use occur in a class + * lock list. + */ + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (!in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d is not in any class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class->name ? : "(?)", + e->links_to->name ? : "(?)"); + return false; + } + } + + /* + * Check whether all list entries that are not in use do not occur in + * a class lock list. + */ + for_each_clear_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + e = list_entries + i; + if (in_any_class_list(&e->entry)) { + printk(KERN_INFO "list entry %d occurs in a class list; class %s <> %s\n", + (unsigned int)(e - list_entries), + e->class && e->class->name ? e->class->name : + "(?)", + e->links_to && e->links_to->name ? 
+ e->links_to->name : "(?)"); + return false; + } + } + + return true; +} + +int check_consistency = 0; +module_param(check_consistency, int, 0644); + +static void check_data_structures(void) +{ + static bool once = false; + + if (check_consistency && !once) { + if (!__check_data_structures()) { + once = true; + WARN_ON(once); + } + } +} + +#else /* CONFIG_DEBUG_LOCKDEP */ + +static inline void check_data_structures(void) { } + +#endif /* CONFIG_DEBUG_LOCKDEP */ + +/* + * Initialize the lock_classes[] array elements, the free_lock_classes list + * and also the delayed_free structure. + */ +static void init_data_structures_once(void) +{ + static bool ds_initialized, rcu_head_initialized; + int i; + + if (likely(rcu_head_initialized)) + return; + + if (system_state >= SYSTEM_SCHEDULING) { + init_rcu_head(&delayed_free.rcu_head); + rcu_head_initialized = true; + } + + if (ds_initialized) + return; + + ds_initialized = true; + + INIT_LIST_HEAD(&delayed_free.pf[0].zapped); + INIT_LIST_HEAD(&delayed_free.pf[1].zapped); + + for (i = 0; i < ARRAY_SIZE(lock_classes); i++) { + list_add_tail(&lock_classes[i].lock_entry, &free_lock_classes); + INIT_LIST_HEAD(&lock_classes[i].locks_after); + INIT_LIST_HEAD(&lock_classes[i].locks_before); + } +} + +static inline struct hlist_head *keyhashentry(const struct lock_class_key *key) +{ + unsigned long hash = hash_long((uintptr_t)key, KEYHASH_BITS); + + return lock_keys_hash + hash; +} + +/* Register a dynamically allocated key. */ +void lockdep_register_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + unsigned long flags; + + if (WARN_ON_ONCE(static_obj(key))) + return; + hash_head = keyhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) + goto restore_irqs; + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (WARN_ON_ONCE(k == key)) + goto out_unlock; + } + hlist_add_head_rcu(&key->hash_entry, hash_head); +out_unlock: + graph_unlock(); +restore_irqs: + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_register_key); + +/* Check whether a key has been registered as a dynamic key. */ +static bool is_dynamic_key(const struct lock_class_key *key) +{ + struct hlist_head *hash_head; + struct lock_class_key *k; + bool found = false; + + if (WARN_ON_ONCE(static_obj(key))) + return false; + + /* + * If lock debugging is disabled lock_keys_hash[] may contain + * pointers to memory that has already been freed. Avoid triggering + * a use-after-free in that case by returning early. + */ + if (!debug_locks) + return true; + + hash_head = keyhashentry(key); + + rcu_read_lock(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + found = true; + break; + } + } + rcu_read_unlock(); + + return found; +} + /* * Register a lock's class in the hash-table, if the class is not present * yet. Otherwise we look it up. We cache the result in the lock object @@ -756,7 +1121,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (!lock->key) { if (!assign_lock_key(lock)) return NULL; - } else if (!static_obj(lock->key)) { + } else if (!static_obj(lock->key) && !is_dynamic_key(lock->key)) { return NULL; } @@ -775,11 +1140,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) goto out_unlock_set; } - /* - * Allocate a new key from the static array, and add it to - * the hash: - */ - if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + init_data_structures_once(); + + /* Allocate a new lock class and add it to the hash. 
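[Editorial aside, not part of the patch] lockdep_register_key() above hashes the address of a dynamically allocated key into lock_keys_hash[] so that is_dynamic_key() can later recognise it by pointer identity. A userspace sketch of the same pointer-keyed hash-with-chaining idea; the bucket count, hash function and helper names are arbitrary choices of the sketch, not the kernel's.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define KEYHASH_BITS	4
#define KEYHASH_SIZE	(1UL << KEYHASH_BITS)

struct key {
	struct key *next;	/* hash chain */
};

static struct key *key_hash[KEYHASH_SIZE];

static unsigned long hash_ptr(const void *p)
{
	/* any cheap mixing of the pointer bits will do for the sketch */
	return ((uintptr_t)p >> 4) & (KEYHASH_SIZE - 1);
}

static void register_key(struct key *k)
{
	unsigned long h = hash_ptr(k);

	k->next = key_hash[h];
	key_hash[h] = k;
}

static bool is_registered_key(const struct key *k)
{
	for (struct key *p = key_hash[hash_ptr(k)]; p; p = p->next)
		if (p == k)
			return true;
	return false;
}

int main(void)
{
	struct key a, b;

	register_key(&a);
	printf("a registered: %d, b registered: %d\n",
	       is_registered_key(&a), is_registered_key(&b));
	return 0;
}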
*/ + class = list_first_entry_or_null(&free_lock_classes, typeof(*class), + lock_entry); + if (!class) { if (!debug_locks_off_graph_unlock()) { return NULL; } @@ -788,13 +1154,14 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) dump_stack(); return NULL; } - class = lock_classes + nr_lock_classes++; + nr_lock_classes++; + __set_bit(class - lock_classes, lock_classes_in_use); debug_atomic_inc(nr_unused_locks); class->key = key; class->name = lock->name; class->subclass = subclass; - INIT_LIST_HEAD(&class->locks_before); - INIT_LIST_HEAD(&class->locks_after); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + WARN_ON_ONCE(!list_empty(&class->locks_after)); class->name_version = count_matching_names(class); /* * We use RCU's safe list-add method to make @@ -802,9 +1169,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) */ hlist_add_head_rcu(&class->hash_entry, hash_head); /* - * Add it to the global list of classes: + * Remove the class from the free list and add it to the global list + * of classes. */ - list_add_tail(&class->lock_entry, &all_lock_classes); + list_move_tail(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -845,7 +1213,10 @@ out_set_class_cache: */ static struct lock_list *alloc_list_entry(void) { - if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + int idx = find_first_zero_bit(list_entries_in_use, + ARRAY_SIZE(list_entries)); + + if (idx >= ARRAY_SIZE(list_entries)) { if (!debug_locks_off_graph_unlock()) return NULL; @@ -853,15 +1224,18 @@ static struct lock_list *alloc_list_entry(void) dump_stack(); return NULL; } - return list_entries + nr_list_entries++; + nr_list_entries++; + __set_bit(idx, list_entries_in_use); + return list_entries + idx; } /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *this, struct list_head *head, +static int add_lock_to_list(struct lock_class *this, + struct lock_class *links_to, struct list_head *head, unsigned long ip, int distance, - struct stack_trace *trace) + struct lock_trace *trace) { struct lock_list *entry; /* @@ -873,6 +1247,7 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, return 0; entry->class = this; + entry->links_to = links_to; entry->distance = distance; entry->trace = *trace; /* @@ -892,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this, struct list_head *head, #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) /* - * The circular_queue and helpers is used to implement the - * breadth-first search(BFS)algorithem, by which we can build - * the shortest path from the next lock to be acquired to the - * previous held lock if there is a circular between them. + * The circular_queue and helpers are used to implement graph + * breadth-first search (BFS) algorithm, by which we can determine + * whether there is a path from a lock to another. In deadlock checks, + * a path from the next lock to be acquired to a previous held lock + * indicates that adding the <prev> -> <next> lock dependency will + * produce a circle in the graph. Breadth-first search instead of + * depth-first search is used in order to find the shortest (circular) + * path. 
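[Editorial aside, not part of the patch] register_lock_class() now draws a lock_class from free_lock_classes and moves it onto all_lock_classes with list_move_tail(), instead of bumping a high-water-mark index, so a zapped class can later be returned to the free list and reused. A compact sketch of that free-list/active-list pattern; the list helpers below are simplified stand-ins for the kernel's <linux/list.h>.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_add_tail(struct list_head *e, struct list_head *head)
{
	e->prev = head->prev;
	e->next = head;
	head->prev->next = e;
	head->prev = e;
}

static void list_move_tail(struct list_head *e, struct list_head *head)
{
	list_del(e);
	list_add_tail(e, head);
}

struct obj { struct list_head entry; int id; };

static struct list_head free_list = LIST_HEAD_INIT(free_list);
static struct list_head used_list = LIST_HEAD_INIT(used_list);

int main(void)
{
	static struct obj pool[4];

	/* boot-time: every slot starts out on the free list */
	for (int i = 0; i < 4; i++) {
		pool[i].id = i;
		list_add_tail(&pool[i].entry, &free_list);
	}

	/* allocation: take the first free slot and make it "registered" */
	if (free_list.next != &free_list)
		list_move_tail(free_list.next, &used_list);

	/* zapping would do the reverse: move the slot back to free_list */
	printf("first used slot: %d\n",
	       ((struct obj *)((char *)used_list.next -
			       offsetof(struct obj, entry)))->id);
	return 0;
}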
*/ struct circular_queue { - unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE]; unsigned int front, rear; }; @@ -924,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq) return ((cq->rear + 1) & CQ_MASK) == cq->front; } -static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem) { if (__cq_full(cq)) return -1; @@ -934,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) return 0; } -static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +/* + * Dequeue an element from the circular_queue, return a lock_list if + * the queue is not empty, or NULL if otherwise. + */ +static inline struct lock_list * __cq_dequeue(struct circular_queue *cq) { + struct lock_list * lock; + if (__cq_empty(cq)) - return -1; + return NULL; - *elem = cq->element[cq->front]; + lock = cq->element[cq->front]; cq->front = (cq->front + 1) & CQ_MASK; - return 0; + + return lock; } static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) @@ -955,7 +1341,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -965,7 +1351,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ + WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -986,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child) return depth; } +/* + * Return the forward or backward dependency list. + * + * @lock: the lock_list to get its class's dependency list + * @offset: the offset to struct lock_class to determine whether it is + * locks_after or locks_before + */ +static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) +{ + void *lock_class = lock->class; + + return lock_class + offset; +} + +/* + * Forward- or backward-dependency search, used for both circular dependency + * checking and hardirq-unsafe/softirq-unsafe checking. 
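[Editorial aside, not part of the patch] struct circular_queue above is a fixed-size, power-of-two ring buffer used as the BFS work queue; the patch types its elements as struct lock_list * so the casts disappear, and __cq_dequeue() now returns the element (or NULL) instead of an error code plus out-parameter. A compact sketch of the same ring buffer, with names of my own:

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>

#define CQ_SIZE	8			/* must be a power of two */
#define CQ_MASK	(CQ_SIZE - 1)

struct node { int v; };			/* whatever the queue carries */

struct cqueue {
	struct node *elem[CQ_SIZE];
	unsigned int front, rear;
};

static int cq_empty(struct cqueue *cq)
{
	return cq->front == cq->rear;
}

static int cq_full(struct cqueue *cq)
{
	return ((cq->rear + 1) & CQ_MASK) == cq->front;
}

static int cq_enqueue(struct cqueue *cq, struct node *n)
{
	if (cq_full(cq))
		return -1;
	cq->elem[cq->rear] = n;
	cq->rear = (cq->rear + 1) & CQ_MASK;
	return 0;
}

/* return the dequeued element, or NULL if the queue is empty */
static struct node *cq_dequeue(struct cqueue *cq)
{
	struct node *n;

	if (cq_empty(cq))
		return NULL;
	n = cq->elem[cq->front];
	cq->front = (cq->front + 1) & CQ_MASK;
	return n;
}

int main(void)
{
	struct cqueue cq = { .front = 0, .rear = 0 };
	struct node a = { 42 };
	struct node *p;

	cq_enqueue(&cq, &a);
	p = cq_dequeue(&cq);
	printf("got %d, queue empty again: %d\n",
	       p ? p->v : -1, cq_dequeue(&cq) == NULL);
	return 0;
}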
+ */ static int __bfs(struct lock_list *source_entry, void *data, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry, - int forward) + int offset) { struct lock_list *entry; + struct lock_list *lock; struct list_head *head; struct circular_queue *cq = &lock_cq; int ret = 1; @@ -1003,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (forward) - head = &source_entry->class->locks_after; - else - head = &source_entry->class->locks_before; - + head = get_dep_list(source_entry, offset); if (list_empty(head)) goto exit; __cq_init(cq); - __cq_enqueue(cq, (unsigned long)source_entry); + __cq_enqueue(cq, source_entry); - while (!__cq_empty(cq)) { - struct lock_list *lock; - - __cq_dequeue(cq, (unsigned long *)&lock); + while ((lock = __cq_dequeue(cq))) { if (!lock->class) { ret = -2; goto exit; } - if (forward) - head = &lock->class->locks_after; - else - head = &lock->class->locks_before; + head = get_dep_list(lock, offset); DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -1041,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry, goto exit; } - if (__cq_enqueue(cq, (unsigned long)entry)) { + if (__cq_enqueue(cq, entry)) { ret = -1; goto exit; } @@ -1060,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 1); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_after)); } @@ -1069,31 +1465,31 @@ static inline int __bfs_backwards(struct lock_list *src_entry, int (*match)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, 0); + return __bfs(src_entry, data, match, target_entry, + offsetof(struct lock_class, locks_before)); } -/* - * Recursive, forwards-direction lock-dependency checking, used for - * both noncyclic checking and for hardirq-unsafe/softirq-unsafe - * checking. 
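[Editorial aside, not part of the patch] __bfs() above no longer takes a forward/backward flag: the caller passes offsetof(struct lock_class, locks_after) or locks_before, and get_dep_list() adds that byte offset to the class pointer to select the list. A small sketch of that offset-selection technique on a made-up struct (all names below are the sketch's own, and it uses a char * cast where the kernel relies on void * arithmetic):

/* --- illustrative sketch (not part of the patch) --- */
#include <stddef.h>
#include <stdio.h>

struct edges { int n; };

struct vertex {
	struct edges out;	/* "locks_after"  in the lockdep analogy */
	struct edges in;	/* "locks_before" in the lockdep analogy */
};

/* pick one of the two edge lists by byte offset instead of by flag */
static struct edges *get_edge_list(struct vertex *v, size_t offset)
{
	return (struct edges *)((char *)v + offset);
}

static int count_edges(struct vertex *v, size_t offset)
{
	return get_edge_list(v, offset)->n;
}

int main(void)
{
	struct vertex v = { .out = { 3 }, .in = { 1 } };

	printf("forward:  %d\n", count_edges(&v, offsetof(struct vertex, out)));
	printf("backward: %d\n", count_edges(&v, offsetof(struct vertex, in)));
	return 0;
}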
- */ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ + unsigned long *entries = stack_trace + trace->offset; + + stack_trace_print(entries, trace->nr_entries, spaces); +} /* * Print a dependency chain entry (this is only done when a deadlock * has been detected): */ -static noinline int +static noinline void print_circular_bug_entry(struct lock_list *target, int depth) { if (debug_locks_silent) - return 0; + return; printk("\n-> #%u", depth); print_lock_name(target->class); printk(KERN_CONT ":\n"); - print_stack_trace(&target->trace, 6); - - return 0; + print_lock_trace(&target->trace, 6); } static void @@ -1150,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src, * When a circular dependency is detected, print the * header first: */ -static noinline int +static noinline void print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct held_lock *check_src, struct held_lock *check_tgt) @@ -1158,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, struct task_struct *curr = current; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("======================================================\n"); @@ -1176,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); print_circular_bug_entry(entry, depth); - - return 0; } static inline int class_equal(struct lock_list *entry, void *data) @@ -1185,11 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data) return entry->class == data; } -static noinline int print_circular_bug(struct lock_list *this, - struct lock_list *target, - struct held_lock *check_src, - struct held_lock *check_tgt, - struct stack_trace *trace) +static noinline void print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) { struct task_struct *curr = current; struct lock_list *parent; @@ -1197,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; if (!save_trace(&this->trace)) - return 0; + return; depth = get_lock_depth(target); @@ -1222,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this, printk("\nstack backtrace:\n"); dump_stack(); - - return 0; } -static noinline int print_bfs_bug(int ret) +static noinline void print_bfs_bug(int ret) { if (!debug_locks_off_graph_unlock()) - return 0; + return; /* * Breadth-first-search failed, graph got corrupted? */ WARN(1, "lockdep bfs error:%d\n", ret); - - return 0; } static int noop_count(struct lock_list *entry, void *data) @@ -1299,49 +1688,114 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) } /* - * Prove that the dependency graph starting at <entry> can not - * lead to <target>. Print an error and return 0 if it does. + * Check that the dependency graph starting at <src> can lead to + * <target> or not. Print an error and return 0 if it does. 
*/ static noinline int -check_noncircular(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_path(struct lock_class *target, struct lock_list *src_entry, + struct lock_list **target_entry) { - int result; + int ret; + + ret = __bfs_forwards(src_entry, (void *)target, class_equal, + target_entry); + + if (unlikely(ret < 0)) + print_bfs_bug(ret); + + return ret; +} + +/* + * Prove that the dependency graph starting at <src> can not + * lead to <target>. If it can, there is a circle when adding + * <target> -> <src> dependency. + * + * Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct held_lock *src, struct held_lock *target, + struct lock_trace *trace) +{ + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_cyclic_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (unlikely(!ret)) { + if (!trace->nr_entries) { + /* + * If save_trace fails here, the printing might + * trigger a WARN but because of the !nr_entries it + * should not do bad things. + */ + save_trace(trace); + } + + print_circular_bug(&src_entry, target_entry, src, target); + } + + return ret; } +#ifdef CONFIG_LOCKDEP_SMALL +/* + * Check that the dependency graph starting at <src> can lead to + * <target> or not. If it can, <src> -> <target> dependency is already + * in the graph. + * + * Print an error and return 2 if it does or 1 if it does not. + */ static noinline int -check_redundant(struct lock_list *root, struct lock_class *target, - struct lock_list **target_entry) +check_redundant(struct held_lock *src, struct held_lock *target) { - int result; + int ret; + struct lock_list *uninitialized_var(target_entry); + struct lock_list src_entry = { + .class = hlock_class(src), + .parent = NULL, + }; debug_atomic_inc(nr_redundant_checks); - result = __bfs_forwards(root, target, class_equal, target_entry); + ret = check_path(hlock_class(target), &src_entry, &target_entry); - return result; + if (!ret) { + debug_atomic_inc(nr_redundant); + ret = 2; + } else if (ret < 0) + ret = 0; + + return ret; +} +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ + *(unsigned long *)mask |= entry->class->usage_mask; + + return 0; } -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency * without creating any illegal irq-safe -> irq-unsafe lock dependency. */ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask) { - return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); + return entry->class->usage_mask & *(unsigned long *)mask; } - - /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. @@ -1353,14 +1807,14 @@ static inline int usage_match(struct lock_list *entry, void *bit) * Return <0 on error. 
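[Editorial aside, not part of the patch] This hunk reuses the same graph walk with two callback styles: usage_accumulate() ORs every visited class's usage_mask into a caller-supplied mask and never "matches" (so the whole subgraph folds into one mask), while usage_match() stops at the first class whose usage_mask intersects the given mask. A sketch of the two styles over a plain array standing in for the visited nodes; the walk itself is deliberately trivial here.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>
#include <stdbool.h>

struct node { unsigned long usage_mask; };

/* accumulate-style: fold the whole set into *mask, never stop early */
static bool accumulate(struct node *n, void *mask)
{
	*(unsigned long *)mask |= n->usage_mask;
	return false;
}

/* match-style: stop on the first node whose usage intersects *mask */
static bool match(struct node *n, void *mask)
{
	return n->usage_mask & *(unsigned long *)mask;
}

/* stand-in for the graph walk: visit nodes until the callback matches */
static struct node *visit(struct node *nodes, int nr,
			  bool (*cb)(struct node *, void *), void *data)
{
	for (int i = 0; i < nr; i++)
		if (cb(&nodes[i], data))
			return &nodes[i];
	return NULL;
}

int main(void)
{
	struct node g[3] = { { 0x1 }, { 0x4 }, { 0x2 } };
	unsigned long seen = 0, want = 0x2;

	visit(g, 3, accumulate, &seen);			/* seen == 0x7 */
	struct node *hit = visit(g, 3, match, &want);	/* &g[2]       */
	printf("seen=%#lx hit=%#lx\n", seen, hit ? hit->usage_mask : 0);
	return 0;
}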
*/ static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1376,14 +1830,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, * Return <0 on error. */ static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask, struct lock_list **target_entry) { int result; debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); return result; } @@ -1405,7 +1859,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) len += printk("%*s %s", depth, "", usage_str[bit]); len += printk(KERN_CONT " at:\n"); - print_stack_trace(class->usage_traces + bit, len); + print_lock_trace(class->usage_traces + bit, len); } } printk("%*s }\n", depth, ""); @@ -1419,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) */ static void __used print_shortest_lock_dependencies(struct lock_list *leaf, - struct lock_list *root) + struct lock_list *root) { struct lock_list *entry = leaf; int depth; @@ -1430,7 +1884,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, do { print_lock_class_header(entry->class, depth); printk("%*s ... acquired at:\n", depth, ""); - print_stack_trace(&entry->trace, 2); + print_lock_trace(&entry->trace, 2); printk("\n"); if (depth == 0 && (entry != root)) { @@ -1441,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf, entry = get_lock_parent(entry); depth--; } while (entry && (depth >= 0)); - - return; } static void @@ -1501,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_bad_irq_dependency(struct task_struct *curr, struct lock_list *prev_root, struct lock_list *next_root, @@ -1514,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr, const char *irqclass) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================================\n"); @@ -1543,14 +1995,14 @@ print_bad_irq_dependency(struct task_struct *curr, print_lock_name(backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); - print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + print_lock_trace(backwards_entry->class->usage_traces + bit1, 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); print_lock_name(forwards_entry->class); pr_warn("\n... 
which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); - print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + print_lock_trace(forwards_entry->class->usage_traces + bit2, 1); pr_warn("\nother info that might help us debug this:\n\n"); print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -1560,52 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); if (!save_trace(&prev_root->trace)) - return 0; + return; print_shortest_lock_dependencies(backwards_entry, prev_root); pr_warn("\nthe dependencies between the lock to be acquired"); pr_warn(" and %s-irq-unsafe lock:\n", irqclass); if (!save_trace(&next_root->trace)) - return 0; + return; print_shortest_lock_dependencies(forwards_entry, next_root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; -} - -static int -check_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit_backwards, - enum lock_usage_bit bit_forwards, const char *irqclass) -{ - int ret; - struct lock_list this, that; - struct lock_list *uninitialized_var(target_entry); - struct lock_list *uninitialized_var(target_entry1); - - this.parent = NULL; - - this.class = hlock_class(prev); - ret = find_usage_backwards(&this, bit_backwards, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - that.parent = NULL; - that.class = hlock_class(next); - ret = find_usage_forwards(&that, bit_forwards, &target_entry1); - if (ret < 0) - return print_bfs_bug(ret); - if (ret == 1) - return ret; - - return print_bad_irq_dependency(curr, &this, &that, - target_entry, target_entry1, - prev, next, - bit_backwards, bit_forwards, irqclass); } static const char *state_names[] = { @@ -1624,70 +2041,192 @@ static const char *state_rnames[] = { static inline const char *state_name(enum lock_usage_bit bit) { - return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; + if (bit & LOCK_USAGE_READ_MASK) + return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; + else + return state_names[bit >> LOCK_USAGE_DIR_MASK]; } +/* + * The bit number is encoded like: + * + * bit0: 0 exclusive, 1 read lock + * bit1: 0 used in irq, 1 irq enabled + * bit2-n: state + */ static int exclusive_bit(int new_bit) { - /* - * USED_IN - * USED_IN_READ - * ENABLED - * ENABLED_READ - * - * bit 0 - write/read - * bit 1 - used_in/enabled - * bit 2+ state - */ - - int state = new_bit & ~3; - int dir = new_bit & 2; + int state = new_bit & LOCK_USAGE_STATE_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * keep state, bit flip the direction and strip read. */ - return state | (dir ^ 2); + return state | (dir ^ LOCK_USAGE_DIR_MASK); } +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. 
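[Editorial aside, not part of the patch] The comments above describe the usage-bit layout (bit 0 = read variant, bit 1 = USED_IN vs ENABLED, bits 2+ = which irq state) and how whole masks can be converted by shifting along that encoding. A sketch of exclusive_bit() on that layout; the concrete bit values in main() are hypothetical examples that merely follow the layout, not lockdep's real enum values.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>

#define READ_MASK	0x01	/* bit 0: read variant                 */
#define DIR_MASK	0x02	/* bit 1: 0 = USED_IN, 1 = ENABLED     */
#define STATE_MASK	(~0x03)	/* bits 2+: which irq state is meant   */

/*
 * The "exclusive" counterpart keeps the state, flips the direction and
 * strips the read flag: a USED_IN(_READ) bit is incompatible with the
 * matching ENABLED bit, and vice versa.
 */
static int exclusive_bit(int bit)
{
	return (bit & STATE_MASK) | ((bit & DIR_MASK) ^ DIR_MASK);
}

int main(void)
{
	/* hypothetical encodings that only follow the layout above */
	int used_in      = 0x4;			/* some state, USED_IN      */
	int used_in_read = 0x4 | READ_MASK;	/* its read variant         */
	int enabled      = 0x4 | DIR_MASK;	/* same state, ENABLED      */

	printf("%#x -> %#x\n", used_in, exclusive_bit(used_in));
	printf("%#x -> %#x\n", used_in_read, exclusive_bit(used_in_read));
	printf("round trip ok: %d\n", exclusive_bit(enabled) == used_in);
	return 0;
}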
+ */ +static unsigned long invert_dir_mask(unsigned long mask) +{ + unsigned long excl = 0; + + /* Invert dir */ + excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; + excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + + return excl; +} + +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. + */ +static unsigned long exclusive_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Strip read */ + excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; + excl &= ~LOCKF_IRQ_READ; + + return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ + unsigned long excl = invert_dir_mask(mask); + + /* Include read in existing usages */ + excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + + return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, + unsigned long excl_mask, + enum lock_usage_bit *bitp, + enum lock_usage_bit *excl_bitp) +{ + int bit, excl; + + for_each_set_bit(bit, &mask, LOCK_USED) { + excl = exclusive_bit(bit); + if (excl_mask & lock_flag(excl)) { + *bitp = bit; + *excl_bitp = excl; + return 0; + } + } + return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, enum lock_usage_bit bit) + struct held_lock *next) { + unsigned long usage_mask = 0, forward_mask, backward_mask; + enum lock_usage_bit forward_bit = 0, backward_bit = 0; + struct lock_list *uninitialized_var(target_entry1); + struct lock_list *uninitialized_var(target_entry); + struct lock_list this, that; + int ret; + /* - * Prove that the new dependency does not connect a hardirq-safe - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 1: gather all hard/soft IRQs usages backward in an + * accumulated usage mask. */ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) + this.parent = NULL; + this.class = hlock_class(prev); + + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + if (ret < 0) { + print_bfs_bug(ret); return 0; + } - bit++; /* _READ */ + usage_mask &= LOCKF_USED_IN_IRQ_ALL; + if (!usage_mask) + return 1; /* - * Prove that the new dependency does not connect a hardirq-safe-read - * lock with a hardirq-unsafe lock - to achieve this we search - * the backwards-subgraph starting at <prev>, and the - * forwards-subgraph starting at <next>: + * Step 2: find exclusive uses forward that match the previous + * backward accumulated mask. 
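[Editorial aside, not part of the patch] find_exclusive_match() above narrows a conflict down to one concrete pair of bits: it walks the set bits of the backward-accumulated usage mask and returns the first bit whose exclusive counterpart appears in the forward lock's mask. A sketch of that loop; the bit-scanning is hand-rolled and first_exclusive_match() only mirrors the shape of the kernel function, not its exact API.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>

#define DIR_MASK	0x02
#define STATE_MASK	(~0x03)
#define NR_BITS		8	/* enough usage bits for the sketch */

static int exclusive_bit(int bit)
{
	return (bit & STATE_MASK) | ((bit & DIR_MASK) ^ DIR_MASK);
}

/*
 * Return 0 and fill *bitp/*excl_bitp with the first pair of conflicting
 * bits between @mask and @excl_mask, or -1 if no such pair exists.
 */
static int first_exclusive_match(unsigned long mask, unsigned long excl_mask,
				 int *bitp, int *excl_bitp)
{
	for (int bit = 0; bit < NR_BITS; bit++) {
		if (!(mask & (1UL << bit)))
			continue;
		int excl = exclusive_bit(bit);
		if (excl_mask & (1UL << excl)) {
			*bitp = bit;
			*excl_bitp = excl;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	int bit, excl;
	/* backward side saw the USED_IN variant of some state ... */
	unsigned long backward = 1UL << 4;
	/* ... forward side saw the ENABLED variant of the same state */
	unsigned long forward  = 1UL << 6;

	if (!first_exclusive_match(backward, forward, &bit, &excl))
		printf("conflict: bit %d vs bit %d\n", bit, excl);
	return 0;
}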
*/ - if (!check_usage(curr, prev, next, bit, - exclusive_bit(bit), state_name(bit))) + forward_mask = exclusive_mask(usage_mask); + + that.parent = NULL; + that.class = hlock_class(next); + + ret = find_usage_forwards(&that, forward_mask, &target_entry1); + if (ret < 0) { + print_bfs_bug(ret); return 0; + } + if (ret == 1) + return ret; - return 1; -} + /* + * Step 3: we found a bad match! Now retrieve a lock from the backward + * list whose usage mask matches the exclusive usage mask from the + * lock found on the forward list. + */ + backward_mask = original_mask(target_entry1->class->usage_mask); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE) \ - if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ + ret = find_usage_backwards(&this, backward_mask, &target_entry); + if (ret < 0) { + print_bfs_bug(ret); return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE + } + if (DEBUG_LOCKS_WARN_ON(ret == 1)) + return 1; - return 1; + /* + * Step 4: narrow down to a pair of incompatible usage bits + * and report it. + */ + ret = find_exclusive_match(target_entry->class->usage_mask, + target_entry1->class->usage_mask, + &backward_bit, &forward_bit); + if (DEBUG_LOCKS_WARN_ON(ret == -1)) + return 1; + + print_bad_irq_dependency(curr, &this, &that, + target_entry, target_entry1, + prev, next, + backward_bit, forward_bit, + state_name(backward_bit)); + + return 0; } static void inc_chains(void) @@ -1704,9 +2243,8 @@ static void inc_chains(void) #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, + struct held_lock *prev, struct held_lock *next) { return 1; } @@ -1716,11 +2254,10 @@ static inline void inc_chains(void) nr_process_chains++; } -#endif +#endif /* CONFIG_TRACE_IRQFLAGS */ static void -print_deadlock_scenario(struct held_lock *nxt, - struct held_lock *prv) +print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) { struct lock_class *next = hlock_class(nxt); struct lock_class *prev = hlock_class(prv); @@ -1738,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt, printk(" May be due to missing lock nesting notation\n\n"); } -static int +static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("============================================\n"); @@ -1762,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -1775,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read */ static int -check_deadlock(struct task_struct *curr, struct held_lock *next, - struct lockdep_map *next_instance, int read) +check_deadlock(struct task_struct *curr, struct held_lock *next) { struct held_lock *prev; struct held_lock *nest = NULL; @@ -1795,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, * Allow read-after-read recursion of the same * lock class (i.e. 
read_lock(lock)+read_lock(lock)): */ - if ((read == 2) && prev->read) + if ((next->read == 2) && prev->read) return 2; /* @@ -1805,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - return print_deadlock_bug(curr, prev, next); + print_deadlock_bug(curr, prev, next); + return 0; } return 1; } /* * There was a chain-cache miss, and we are about to add a new dependency - * to a previous lock. We recursively validate the following rules: + * to a previous lock. We validate the following rules: * * - would the adding of the <prev> -> <next> dependency create a * circular dependency in the graph? [== circular deadlock] @@ -1834,41 +2369,44 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, struct stack_trace *trace, - int (*save)(struct stack_trace *trace)) + struct held_lock *next, int distance, struct lock_trace *trace) { - struct lock_list *uninitialized_var(target_entry); struct lock_list *entry; - struct lock_list this; int ret; + if (!hlock_class(prev)->key || !hlock_class(next)->key) { + /* + * The warning statements below may trigger a use-after-free + * of the class name. It is better to trigger a use-after free + * and to have the class name most of the time instead of not + * having the class name available. + */ + WARN_ONCE(!debug_locks_silent && !hlock_class(prev)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(prev), + hlock_class(prev)->name); + WARN_ONCE(!debug_locks_silent && !hlock_class(next)->key, + "Detected use-after-free of lock class %px/%s\n", + hlock_class(next), + hlock_class(next)->name); + return 2; + } + /* * Prove that the new <prev> -> <next> dependency would not * create a circular dependency in the graph. (We do this by - * forward-recursing into the graph starting at <next>, and - * checking whether we can reach <prev>.) + * a breadth-first search into the graph starting at <next>, + * and check whether we can reach <prev>.) * - * We are using global variables to control the recursion, to - * keep the stackframe size of the recursive functions low: + * The search is limited by the size of the circular queue (i.e., + * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes + * in the graph whose neighbours are to be checked. */ - this.class = hlock_class(next); - this.parent = NULL; - ret = check_noncircular(&this, hlock_class(prev), &target_entry); - if (unlikely(!ret)) { - if (!trace->entries) { - /* - * If @save fails here, the printing might trigger - * a WARN but because of the !nr_entries it should - * not do bad things. - */ - save(trace); - } - return print_circular_bug(&this, target_entry, next, prev, trace); - } - else if (unlikely(ret < 0)) - return print_bfs_bug(ret); + ret = check_noncircular(next, prev, trace); + if (unlikely(ret <= 0)) + return 0; - if (!check_prev_add_irq(curr, prev, next)) + if (!check_irq_usage(curr, prev, next)) return 0; /* @@ -1897,35 +2435,30 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } +#ifdef CONFIG_LOCKDEP_SMALL /* * Is the <prev> -> <next> link redundant? 
*/ - this.class = hlock_class(prev); - this.parent = NULL; - ret = check_redundant(&this, hlock_class(next), &target_entry); - if (!ret) { - debug_atomic_inc(nr_redundant); - return 2; - } - if (ret < 0) - return print_bfs_bug(ret); - + ret = check_redundant(prev, next); + if (ret != 1) + return ret; +#endif - if (!trace->entries && !save(trace)) + if (!trace->nr_entries && !save_trace(trace)) return 0; /* * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(next), + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), &hlock_class(prev)->locks_after, next->acquire_ip, distance, trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), &hlock_class(next)->locks_before, next->acquire_ip, distance, trace); if (!ret) @@ -1943,14 +2476,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { + struct lock_trace trace = { .nr_entries = 0 }; int depth = curr->lockdep_depth; struct held_lock *hlock; - struct stack_trace trace = { - .nr_entries = 0, - .max_entries = 0, - .entries = NULL, - .skip = 0, - }; /* * Debugging checks. @@ -1976,7 +2504,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) * added: */ if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + int ret = check_prev_add(curr, hlock, next, distance, + &trace); if (!ret) return 0; @@ -2018,8 +2547,8 @@ out_bug: return 0; } -unsigned long nr_lock_chains; struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; +static DECLARE_BITMAP(lock_chains_in_use, MAX_LOCKDEP_CHAINS); int nr_chain_hlocks; static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; @@ -2065,12 +2594,13 @@ static void print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) { struct held_lock *hlock; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int depth = curr->lockdep_depth; - int i; + int i = get_first_held_lock(curr, hlock_next); - printk("depth: %u\n", depth + 1); - for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + printk("depth: %u (irq_context %u)\n", depth - i + 1, + hlock_next->irq_context); + for (; i < depth; i++) { hlock = curr->held_locks + i; chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); @@ -2084,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne static void print_chain_keys_chain(struct lock_chain *chain) { int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; int class_id; printk("depth: %u\n", chain->depth); for (i = 0; i < chain->depth; i++) { class_id = chain_hlocks[chain->base + i]; - chain_key = print_chain_key_iteration(class_id + 1, chain_key); + chain_key = print_chain_key_iteration(class_id, chain_key); print_lock_name(lock_classes + class_id); printk("\n"); @@ -2141,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr, } for (j = 0; j < chain->depth - 1; j++, i++) { - id = curr->held_locks[i].class_idx - 1; + id = curr->held_locks[i].class_idx; if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { print_collision(curr, hlock, chain); @@ -2153,6 +2683,33 @@ static int check_no_collision(struct task_struct *curr, } /* + * Given an index that is >= -1, return the index of the next lock chain. + * Return -2 if there is no next lock chain. 
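[Editorial aside, not part of the patch] With the lock_chains_in_use bitmap introduced above, chains are allocated by finding the first clear bit, counted by bitmap weight, and iterated with the "next set bit after index i, or -2" helper described in the comment just above. A userspace sketch of those three operations on a single-word bitmap; the helpers are hand-rolled stand-ins for find_first_zero_bit()/find_next_bit()/bitmap_weight(), and count_slots() uses a GCC/Clang builtin.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>

#define NR_SLOTS 64
static unsigned long long in_use;	/* one bit per slot */

static long next_in_use(long i)		/* cf. the -1 / -2 convention above */
{
	for (i = i + 1; i < NR_SLOTS; i++)
		if (in_use & (1ULL << i))
			return i;
	return -2;			/* no further slot in use */
}

static int alloc_slot(void)
{
	for (int i = 0; i < NR_SLOTS; i++)
		if (!(in_use & (1ULL << i))) {
			in_use |= 1ULL << i;
			return i;
		}
	return -1;			/* table exhausted */
}

static int count_slots(void)
{
	return __builtin_popcountll(in_use);	/* population count */
}

int main(void)
{
	alloc_slot();
	alloc_slot();
	printf("in use: %d\n", count_slots());
	for (long i = next_in_use(-1); i != -2; i = next_in_use(i))
		printf("slot %ld\n", i);
	return 0;
}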
+ */ +long lockdep_next_lockchain(long i) +{ + i = find_next_bit(lock_chains_in_use, ARRAY_SIZE(lock_chains), i + 1); + return i < ARRAY_SIZE(lock_chains) ? i : -2; +} + +unsigned long lock_chain_count(void) +{ + return bitmap_weight(lock_chains_in_use, ARRAY_SIZE(lock_chains)); +} + +/* Must be called with the graph lock held. */ +static struct lock_chain *alloc_lock_chain(void) +{ + int idx = find_first_zero_bit(lock_chains_in_use, + ARRAY_SIZE(lock_chains)); + + if (unlikely(idx >= ARRAY_SIZE(lock_chains))) + return NULL; + __set_bit(idx, lock_chains_in_use); + return lock_chains + idx; +} + +/* * Adds a dependency chain into chain hashtable. And must be called with * graph_lock held. * @@ -2169,19 +2726,15 @@ static inline int add_chain_cache(struct task_struct *curr, int i, j; /* - * Allocate a new chain entry from the static array, and add - * it to the hash: - */ - - /* - * We might need to take the graph lock, ensure we've got IRQs + * The caller must hold the graph lock, ensure we've got IRQs * disabled to make this an IRQ-safe lock.. for recursion reasons * lockdep won't complain about its own locking errors. */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { + chain = alloc_lock_chain(); + if (!chain) { if (!debug_locks_off_graph_unlock()) return 0; @@ -2189,7 +2742,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } - chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; i = get_first_held_lock(curr, hlock); @@ -2202,20 +2754,12 @@ static inline int add_chain_cache(struct task_struct *curr, if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class_idx - 1; + int lock_id = curr->held_locks[i].class_idx; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; - } - - if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS) nr_chain_hlocks += chain->depth; - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * Important for check_no_collision(). - */ - if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) { + } else { if (!debug_locks_off_graph_unlock()) return 0; @@ -2223,7 +2767,6 @@ static inline int add_chain_cache(struct task_struct *curr, dump_stack(); return 0; } -#endif hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); @@ -2233,19 +2776,16 @@ static inline int add_chain_cache(struct task_struct *curr, } /* - * Look up a dependency chain. + * Look up a dependency chain. Must be called with either the graph lock or + * the RCU read lock held. 
*/ static inline struct lock_chain *lookup_chain_cache(u64 chain_key) { struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - /* - * We can walk it lock-free, because entries only get added - * to the hash: - */ hlist_for_each_entry_rcu(chain, hash_head, entry) { - if (chain->chain_key == chain_key) { + if (READ_ONCE(chain->chain_key) == chain_key) { debug_atomic_inc(chain_lookup_hits); return chain; } @@ -2304,8 +2844,9 @@ cache_hit: return 1; } -static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, - struct held_lock *hlock, int chain_head, u64 chain_key) +static int validate_chain(struct task_struct *curr, + struct held_lock *hlock, + int chain_head, u64 chain_key) { /* * Trylock needs to maintain the stack of held locks, but it @@ -2326,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * - is softirq-safe, if this lock is hardirq-unsafe * * And check whether the new lock's dependency graph - * could lead back to the previous lock. + * could lead back to the previous lock: + * + * - within the current held-lock stack + * - across our accumulated lock dependency records * - * any of these scenarios could lead to a deadlock. If - * All validations + * any of these scenarios could lead to a deadlock. + */ + /* + * The simple case: does the current hold the same lock + * already? */ - int ret = check_deadlock(curr, hlock, lock, hlock->read); + int ret = check_deadlock(curr, hlock); if (!ret) return 0; @@ -2362,12 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, } #else static inline int validate_chain(struct task_struct *curr, - struct lockdep_map *lock, struct held_lock *hlock, - int chain_head, u64 chain_key) + struct held_lock *hlock, + int chain_head, u64 chain_key) { return 1; } -#endif +#endif /* CONFIG_PROVE_LOCKING */ /* * We are building curr_chain_key incrementally, so double-check @@ -2378,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; unsigned int i; - u64 chain_key = 0; + u64 chain_key = INITIAL_CHAIN_KEY; for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -2394,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } + /* - * Whoops ran out of static storage again? + * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is + * it registered lock class index? 
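[Editorial aside, not part of the patch] check_chain_key() now starts the running key from INITIAL_CHAIN_KEY, restarts it whenever the irq context of a held lock changes, and folds in class_idx directly rather than class_idx - 1. A sketch of that recomputation shape; the fold() mixer and the INITIAL_KEY value are placeholders for iterate_chain_key() and INITIAL_CHAIN_KEY, whose definitions are not shown in this hunk.

/* --- illustrative sketch (not part of the patch) --- */
#include <stdio.h>
#include <stdint.h>

#define INITIAL_KEY	0ULL		/* stand-in for INITIAL_CHAIN_KEY */

struct held {
	unsigned int class_idx;
	unsigned int irq_context;
};

/* placeholder mixer; the kernel uses a real hash in iterate_chain_key() */
static uint64_t fold(uint64_t key, uint32_t idx)
{
	return key * 0x100000001b3ULL ^ idx;
}

static uint64_t recompute_chain_key(const struct held *locks, int depth)
{
	uint64_t key = INITIAL_KEY;
	unsigned int prev_ctx = 0;

	for (int i = 0; i < depth; i++) {
		/* a new irq context starts a fresh chain, hence a fresh key */
		if (i && locks[i].irq_context != prev_ctx)
			key = INITIAL_KEY;
		key = fold(key, locks[i].class_idx);
		prev_ctx = locks[i].irq_context;
	}
	return key;
}

int main(void)
{
	struct held stack[] = {
		{ .class_idx = 3, .irq_context = 0 },
		{ .class_idx = 7, .irq_context = 0 },
		{ .class_idx = 9, .irq_context = 1 },	/* entered hardirq */
	};

	printf("chain key: %#llx\n",
	       (unsigned long long)recompute_chain_key(stack, 3));
	return 0;
}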
*/ - if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } @@ -2420,8 +2969,11 @@ static void check_chain_key(struct task_struct *curr) #endif } -static void -print_usage_bug_scenario(struct held_lock *lock) +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +static int mark_lock(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit); + +static void print_usage_bug_scenario(struct held_lock *lock) { struct lock_class *class = hlock_class(lock); @@ -2438,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock) printk("\n *** DEADLOCK ***\n\n"); } -static int +static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("================================\n"); @@ -2463,7 +3015,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); + print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); pr_warn("\nother info that might help us debug this:\n"); @@ -2473,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2484,20 +3034,18 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) - return print_usage_bug(curr, this, bad_bit, new_bit); + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + print_usage_bug(curr, this, bad_bit, new_bit); + return 0; + } return 1; } -static int mark_lock(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: */ -static int +static void print_irq_inversion_bug(struct task_struct *curr, struct lock_list *root, struct lock_list *other, struct held_lock *this, int forwards, @@ -2508,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr, int depth; if (!debug_locks_off_graph_unlock() || debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("========================================================\n"); @@ -2549,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); if (!save_trace(&root->trace)) - return 0; + return; print_shortest_lock_dependencies(other, root); pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } /* @@ -2572,14 +3118,17 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_forwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - 
this, 1, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); + return 0; } /* @@ -2596,14 +3145,17 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, root.parent = NULL; root.class = hlock_class(this); - ret = find_usage_backwards(&root, bit, &target_entry); - if (ret < 0) - return print_bfs_bug(ret); + ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); + if (ret < 0) { + print_bfs_bug(ret); + return 0; + } if (ret == 1) return ret; - return print_irq_inversion_bug(curr, &root, target_entry, - this, 0, irqclass); + print_irq_inversion_bug(curr, &root, target_entry, + this, 0, irqclass); + return 0; } void print_irqtrace_events(struct task_struct *curr) @@ -2651,7 +3203,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = { static inline int state_verbose(enum lock_usage_bit bit, struct lock_class *class) { - return state_verbose_f[bit >> 2](class); + return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class); } typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, @@ -2662,8 +3214,8 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); - int read = new_bit & 1; - int dir = new_bit & 2; + int read = new_bit & LOCK_USAGE_READ_MASK; + int dir = new_bit & LOCK_USAGE_DIR_MASK; /* * mark USED_IN has to look forwards -- to ensure no dependency @@ -2686,20 +3238,20 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, * Validate that the lock dependencies don't have conflicting usage * states. */ - if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + if ((!read || STRICT_READ_CHECKS) && + !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) return 0; /* * Check for read in write conflicts */ if (!read) { - if (!valid_state(curr, this, new_bit, excl_bit + 1)) + if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK)) return 0; if (STRICT_READ_CHECKS && - !usage(curr, this, excl_bit + 1, - state_name(new_bit + 1))) + !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK, + state_name(new_bit + LOCK_USAGE_READ_MASK))) return 0; } @@ -2709,35 +3261,28 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, return 1; } -enum mark_type { -#define LOCKDEP_STATE(__STATE) __STATE, -#include "lockdep_states.h" -#undef LOCKDEP_STATE -}; - /* * Mark all held locks with a usage bit: */ static int -mark_held_locks(struct task_struct *curr, enum mark_type mark) +mark_held_locks(struct task_struct *curr, enum lock_usage_bit base_bit) { - enum lock_usage_bit usage_bit; struct held_lock *hlock; int i; for (i = 0; i < curr->lockdep_depth; i++) { + enum lock_usage_bit hlock_bit = base_bit; hlock = curr->held_locks + i; - usage_bit = 2 + (mark << 2); /* ENABLED */ if (hlock->read) - usage_bit += 1; /* READ */ + hlock_bit += LOCK_USAGE_READ_MASK; - BUG_ON(usage_bit >= LOCK_USAGE_STATES); + BUG_ON(hlock_bit >= LOCK_USAGE_STATES); if (!hlock->check) continue; - if (!mark_lock(curr, hlock, usage_bit)) + if (!mark_lock(curr, hlock, hlock_bit)) return 0; } @@ -2758,7 +3303,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) * We are going to turn hardirqs on, so set the * usage bit for all held locks: */ - if (!mark_held_locks(curr, HARDIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_HARDIRQ)) return; /* * If we have softirqs enabled, then set the usage @@ -2766,7 +3311,7 @@ static void __trace_hardirqs_on_caller(unsigned 
long ip) * this bit from being set before) */ if (curr->softirqs_enabled) - if (!mark_held_locks(curr, SOFTIRQ)) + if (!mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ)) return; curr->hardirq_enable_ip = ip; @@ -2800,7 +3345,7 @@ void lockdep_hardirqs_on(unsigned long ip) /* * See the fine text that goes along with this variable definition. */ - if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled)) return; /* @@ -2814,6 +3359,7 @@ void lockdep_hardirqs_on(unsigned long ip) __trace_hardirqs_on_caller(ip); current->lockdep_recursion = 0; } +NOKPROBE_SYMBOL(lockdep_hardirqs_on); /* * Hardirqs were disabled: @@ -2843,6 +3389,7 @@ void lockdep_hardirqs_off(unsigned long ip) } else debug_atomic_inc(redundant_hardirqs_off); } +NOKPROBE_SYMBOL(lockdep_hardirqs_off); /* * Softirqs will be enabled: @@ -2880,7 +3427,7 @@ void trace_softirqs_on(unsigned long ip) * enabled too: */ if (curr->hardirqs_enabled) - mark_held_locks(curr, SOFTIRQ); + mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ); current->lockdep_recursion = 0; } @@ -2916,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip) debug_atomic_inc(redundant_softirqs_off); } -static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +static int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) { + if (!check) + goto lock_used; + /* * If non-trylock use in a hardirq or softirq context, then * mark the lock as used in these contexts: @@ -2961,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) } } +lock_used: + /* mark it as used: */ + if (!mark_lock(curr, hlock, LOCK_USED)) + return 0; + return 1; } @@ -2992,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - -static inline -int mark_lock_irq(struct task_struct *curr, struct held_lock *this, - enum lock_usage_bit new_bit) -{ - WARN_ON(1); /* Impossible innit? 
when we don't have TRACE_IRQFLAG */ - return 1; -} - -static inline int mark_irqflags(struct task_struct *curr, - struct held_lock *hlock) -{ - return 1; -} - -static inline unsigned int task_irq_context(struct task_struct *task) -{ - return 0; -} - -static inline int separate_irq_context(struct task_struct *curr, - struct held_lock *hlock) -{ - return 0; -} - -#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ - /* * Mark a lock with a usage bit, and validate the state transition: */ @@ -3029,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, { unsigned int new_mask = 1 << new_bit, ret = 1; + if (new_bit >= LOCK_USAGE_STATES) { + DEBUG_LOCKS_WARN_ON(1); + return 0; + } + /* * If already set then do not dirty the cacheline, * nor do any checks: @@ -3052,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; switch (new_bit) { -#define LOCKDEP_STATE(__STATE) \ - case LOCK_USED_IN_##__STATE: \ - case LOCK_USED_IN_##__STATE##_READ: \ - case LOCK_ENABLED_##__STATE: \ - case LOCK_ENABLED_##__STATE##_READ: -#include "lockdep_states.h" -#undef LOCKDEP_STATE - ret = mark_lock_irq(curr, this, new_bit); - if (!ret) - return 0; - break; case LOCK_USED: debug_atomic_dec(nr_unused_locks); break; default: - if (!debug_locks_off_graph_unlock()) + ret = mark_lock_irq(curr, this, new_bit); + if (!ret) return 0; - WARN_ON(1); - return 0; } graph_unlock(); @@ -3088,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return ret; } +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + +static inline int +mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) +{ + return 1; +} + +static inline unsigned int task_irq_context(struct task_struct *task) +{ + return 0; +} + +static inline int separate_irq_context(struct task_struct *curr, + struct held_lock *hlock) +{ + return 0; +} + +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ + /* * Initialize a lock instance's lock-class mapping info: */ @@ -3119,13 +3660,12 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!key)) return; /* - * Sanity check, the lock-class key must be persistent: + * Sanity check, the lock-class key must either have been allocated + * statically or must have been registered as a dynamic key. */ - if (!static_obj(key)) { - printk("BUG: key %px not in .data!\n", key); - /* - * What it says above ^^^^^, I suggest you read it. 
- */ + if (!static_obj(key) && !is_dynamic_key(key)) { + if (debug_locks) + printk(KERN_ERR "BUG: key %px has not been registered!\n", key); DEBUG_LOCKS_WARN_ON(1); return; } @@ -3152,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); -static int +static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock, unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("==================================\n"); @@ -3182,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int __lock_is_held(const struct lockdep_map *lock, int read); @@ -3248,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; - class_idx = class - lock_classes + 1; + class_idx = class - lock_classes; if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) { - /* - * Check: unsigned int references:12, overflow. - */ - if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) - return 0; + if (!references) + references++; + if (!hlock->references) hlock->references++; - } else { - hlock->references = 2; - } - return 1; + hlock->references += references; + + /* Overflow */ + if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) + return 0; + + return 2; } } @@ -3292,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif hlock->pin_count = pin_count; - if (check && !mark_irqflags(curr, hlock)) - return 0; - - /* mark it as used: */ - if (!mark_lock(curr, hlock, LOCK_USED)) + /* Initialize the lock usage bit */ + if (!mark_usage(curr, hlock, check)) return 0; /* @@ -3310,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ /* - * Whoops, we did it again.. ran straight out of our static allocation. + * Whoops, we did it again.. class_idx is invalid. */ - if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) return 0; chain_key = curr->curr_chain_key; @@ -3320,22 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, /* * How can we have a chain hash when we ain't got no keys?! 
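/*
 * The chain key rebuilt in these hunks is a running fold of the held-lock
 * class indices, restarted from INITIAL_CHAIN_KEY whenever the IRQ context
 * changes.  A standalone sketch of that folding with a stand-in mixer; the
 * real iterate_chain_key() and the INITIAL_CHAIN_KEY value are defined in the
 * lockdep code, so the constants below are placeholders.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_INITIAL_CHAIN_KEY ((uint64_t)-1)	/* placeholder sentinel */

static uint64_t mix_chain_key(uint64_t key, uint32_t class_idx)
{
	/* any decent 64-bit mixer works for the illustration */
	key ^= class_idx + 0x9e3779b97f4a7c15ULL;
	key ^= key >> 33;
	key *= 0xff51afd7ed558ccdULL;
	key ^= key >> 33;
	return key;
}

int main(void)
{
	uint32_t held_classes[] = { 3, 17, 42 };	/* hypothetical class indices */
	uint64_t chain_key = SKETCH_INITIAL_CHAIN_KEY;
	unsigned int i;

	for (i = 0; i < sizeof(held_classes) / sizeof(held_classes[0]); i++)
		chain_key = mix_chain_key(chain_key, held_classes[i]);

	printf("chain key for this stack: %016llx\n",
	       (unsigned long long)chain_key);
	return 0;
}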
*/ - if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) + if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) return 0; chain_head = 1; } hlock->prev_chain_key = chain_key; if (separate_irq_context(curr, hlock)) { - chain_key = 0; + chain_key = INITIAL_CHAIN_KEY; chain_head = 1; } chain_key = iterate_chain_key(chain_key, class_idx); - if (nest_lock && !__lock_is_held(nest_lock, -1)) - return print_lock_nested_lock_not_held(curr, hlock, ip); + if (nest_lock && !__lock_is_held(nest_lock, -1)) { + print_lock_nested_lock_not_held(curr, hlock, ip); + return 0; + } + + if (!debug_locks_silent) { + WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); + WARN_ON_ONCE(!hlock_class(hlock)->key); + } - if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) + if (!validate_chain(curr, hlock, chain_head, chain_key)) return 0; curr->curr_chain_key = chain_key; @@ -3364,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 1; } -static int -print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_unlock_imbalance_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=====================================\n"); @@ -3389,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static int match_held_lock(const struct held_lock *hlock, @@ -3422,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock, if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; - if (hlock->class_idx == class - lock_classes + 1) + if (hlock->class_idx == class - lock_classes) return 1; } @@ -3466,22 +4006,33 @@ out: } static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, - int idx) + int idx, unsigned int *merged) { struct held_lock *hlock; + int first_idx = idx; if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { - if (!__lock_acquire(hlock->instance, + switch (__lock_acquire(hlock->instance, hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) + hlock->references, hlock->pin_count)) { + case 0: return 1; + case 1: + break; + case 2: + *merged += (idx == first_idx); + break; + default: + WARN_ON(1); + return 0; + } } return 0; } @@ -3492,11 +4043,14 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; struct lock_class *class; - unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3506,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name, return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); - hlock->class_idx = class - lock_classes + 1; + hlock->class_idx = class - lock_classes; curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - if (reacquire_held_locks(curr, 
depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) return 0; /* * I took it apart and put it back together again, except now I have * these 'spare' parts.. where shall I put them. */ - if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged)) return 0; return 1; } @@ -3531,10 +4087,13 @@ __lock_set_class(struct lockdep_map *lock, const char *name, static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 0; struct held_lock *hlock; - unsigned int depth; int i; + if (unlikely(!debug_locks)) + return 0; + depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, @@ -3544,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) return 0; hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; @@ -3554,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) hlock->read = 1; hlock->acquire_ip = ip; - if (reacquire_held_locks(curr, depth, i)) + if (reacquire_held_locks(curr, depth, i, &merged)) + return 0; + + /* Merging can't happen with unchanged classes.. */ + if (DEBUG_LOCKS_WARN_ON(merged)) return 0; /* @@ -3563,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; + return 1; } @@ -3574,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) * @nested is an hysterical artifact, needs a tree wide cleanup. */ static int -__lock_release(struct lockdep_map *lock, int nested, unsigned long ip) +__lock_release(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; + unsigned int depth, merged = 1; struct held_lock *hlock; - unsigned int depth; int i; if (unlikely(!debug_locks)) @@ -3589,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * So we're all set to release this lock.. wait what lock? We don't * own any locks, you've been drinking again? */ - if (DEBUG_LOCKS_WARN_ON(depth <= 0)) - return print_unlock_imbalance_bug(curr, lock, ip); + if (depth <= 0) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } /* * Check whether the lock exists in the current stack * of held locks: */ hlock = find_held_lock(curr, lock, depth, &i); - if (!hlock) - return print_unlock_imbalance_bug(curr, lock, ip); + if (!hlock) { + print_unlock_imbalance_bug(curr, lock, ip); + return 0; + } if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -3633,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) if (i == depth-1) return 1; - if (reacquire_held_locks(curr, depth, i + 1)) + if (reacquire_held_locks(curr, depth, i + 1, &merged)) return 0; /* * We had N bottles of beer on the wall, we drank one, but now * there's not N-1 bottles of beer left on the wall... + * Pouring two of the bottles together is acceptable. 
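/*
 * When __lock_acquire() folds a re-acquired lock into an entry already on the
 * held-lock stack (the new "return 2" path), reacquire_held_locks() counts it
 * in *merged and the callers check depth - merged instead of depth.  A loose
 * standalone model of that arithmetic: the real code only merges nest_lock'ed
 * duplicates of the same class, while the toy below merges any immediate
 * duplicate, which is enough to show the depth accounting.
 */
#include <stdio.h>

int main(void)
{
	int saved_classes[] = { 5, 5, 9, 9, 9 };	/* hypothetical class indices */
	int old_depth = 5, depth = 0, merged = 0;
	int top = -1, i;

	for (i = 0; i < old_depth; i++) {
		if (depth && top == saved_classes[i]) {
			merged++;		/* folded into existing entry */
			continue;
		}
		top = saved_classes[i];
		depth++;
	}

	/* Same invariant the patch asserts after reacquire_held_locks(). */
	printf("depth=%d old_depth-merged=%d\n", depth, old_depth - merged);
	return 0;
}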
*/ - DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); + DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged); /* * Since reacquire_held_locks() would have called check_chain_key() @@ -3650,7 +4221,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 0; } -static int __lock_is_held(const struct lockdep_map *lock, int read) +static nokprobe_inline +int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -3857,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested, check_flags(flags); current->lockdep_recursion = 1; trace_lock_release(lock, ip); - if (__lock_release(lock, nested, ip)) + if (__lock_release(lock, ip)) check_chain_key(current); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3883,6 +4455,7 @@ int lock_is_held_type(const struct lockdep_map *lock, int read) return ret; } EXPORT_SYMBOL_GPL(lock_is_held_type); +NOKPROBE_SYMBOL(lock_is_held_type); struct pin_cookie lock_pin_lock(struct lockdep_map *lock) { @@ -3939,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) EXPORT_SYMBOL_GPL(lock_unpin_lock); #ifdef CONFIG_LOCK_STAT -static int -print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, - unsigned long ip) +static void print_lock_contention_bug(struct task_struct *curr, + struct lockdep_map *lock, + unsigned long ip) { if (!debug_locks_off()) - return 0; + return; if (debug_locks_silent) - return 0; + return; pr_warn("\n"); pr_warn("=================================\n"); @@ -3964,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, pr_warn("\nstack backtrace:\n"); dump_stack(); - - return 0; } static void @@ -4110,9 +4681,7 @@ void lockdep_reset(void) int i; raw_local_irq_save(flags); - current->curr_chain_key = 0; - current->lockdep_depth = 0; - current->lockdep_recursion = 0; + lockdep_init_task(current); memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); nr_hardirq_chains = 0; nr_softirq_chains = 0; @@ -4123,29 +4692,132 @@ void lockdep_reset(void) raw_local_irq_restore(flags); } +/* Remove a class from a lock chain. Must be called with the graph lock held. */ +static void remove_class_from_lock_chain(struct pending_free *pf, + struct lock_chain *chain, + struct lock_class *class) +{ +#ifdef CONFIG_PROVE_LOCKING + struct lock_chain *new_chain; + u64 chain_key; + int i; + + for (i = chain->base; i < chain->base + chain->depth; i++) { + if (chain_hlocks[i] != class - lock_classes) + continue; + /* The code below leaks one chain_hlock[] entry. */ + if (--chain->depth > 0) { + memmove(&chain_hlocks[i], &chain_hlocks[i + 1], + (chain->base + chain->depth - i) * + sizeof(chain_hlocks[0])); + } + /* + * Each lock class occurs at most once in a lock chain so once + * we found a match we can break out of this loop. + */ + goto recalc; + } + /* Since the chain has not been modified, return. */ + return; + +recalc: + chain_key = INITIAL_CHAIN_KEY; + for (i = chain->base; i < chain->base + chain->depth; i++) + chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); + if (chain->depth && chain->chain_key == chain_key) + return; + /* Overwrite the chain key for concurrent RCU readers. */ + WRITE_ONCE(chain->chain_key, chain_key); + /* + * Note: calling hlist_del_rcu() from inside a + * hlist_for_each_entry_rcu() loop is safe. 
+ */ + hlist_del_rcu(&chain->entry); + __set_bit(chain - lock_chains, pf->lock_chains_being_freed); + if (chain->depth == 0) + return; + /* + * If the modified lock chain matches an existing lock chain, drop + * the modified lock chain. + */ + if (lookup_chain_cache(chain_key)) + return; + new_chain = alloc_lock_chain(); + if (WARN_ON_ONCE(!new_chain)) { + debug_locks_off(); + return; + } + *new_chain = *chain; + hlist_add_head_rcu(&new_chain->entry, chainhashentry(chain_key)); +#endif +} + +/* Must be called with the graph lock held. */ +static void remove_class_from_lock_chains(struct pending_free *pf, + struct lock_class *class) +{ + struct lock_chain *chain; + struct hlist_head *head; + int i; + + for (i = 0; i < ARRAY_SIZE(chainhash_table); i++) { + head = chainhash_table + i; + hlist_for_each_entry_rcu(chain, head, entry) { + remove_class_from_lock_chain(pf, chain, class); + } + } +} + /* * Remove all references to a lock class. The caller must hold the graph lock. */ -static void zap_class(struct lock_class *class) +static void zap_class(struct pending_free *pf, struct lock_class *class) { + struct lock_list *entry; int i; + WARN_ON_ONCE(!class->key); + /* * Remove all dependencies this lock is * involved in: */ - for (i = 0; i < nr_list_entries; i++) { - if (list_entries[i].class == class) - list_del_rcu(&list_entries[i].entry); + for_each_set_bit(i, list_entries_in_use, ARRAY_SIZE(list_entries)) { + entry = list_entries + i; + if (entry->class != class && entry->links_to != class) + continue; + __clear_bit(i, list_entries_in_use); + nr_list_entries--; + list_del_rcu(&entry->entry); + } + if (list_empty(&class->locks_after) && + list_empty(&class->locks_before)) { + list_move_tail(&class->lock_entry, &pf->zapped); + hlist_del_rcu(&class->hash_entry); + WRITE_ONCE(class->key, NULL); + WRITE_ONCE(class->name, NULL); + nr_lock_classes--; + __clear_bit(class - lock_classes, lock_classes_in_use); + } else { + WARN_ONCE(true, "%s() failed for class %s\n", __func__, + class->name); } - /* - * Unhash the class and remove it from the all_lock_classes list: - */ - hlist_del_rcu(&class->hash_entry); - list_del(&class->lock_entry); - RCU_INIT_POINTER(class->key, NULL); - RCU_INIT_POINTER(class->name, NULL); + remove_class_from_lock_chains(pf, class); +} + +static void reinit_class(struct lock_class *class) +{ + void *const p = class; + const unsigned int offset = offsetof(struct lock_class, key); + + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); + memset(p + offset, 0, sizeof(*class) - offset); + WARN_ON_ONCE(!class->lock_entry.next); + WARN_ON_ONCE(!list_empty(&class->locks_after)); + WARN_ON_ONCE(!list_empty(&class->locks_before)); } static inline int within(const void *addr, void *start, unsigned long size) @@ -4153,55 +4825,171 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +static bool inside_selftest(void) +{ + return current == lockdep_selftest_task_struct; +} + +/* The caller must hold the graph lock. */ +static struct pending_free *get_pending_free(void) +{ + return delayed_free.pf + delayed_free.index; +} + +static void free_zapped_rcu(struct rcu_head *cb); + /* - * Used in module.c to remove lock classes from memory that is going to be - * freed; and possibly re-used by other modules. 
- * - * We will have had one sync_sched() before getting here, so we're guaranteed - * nobody will look up these exact classes -- they're properly dead but still - * allocated. + * Schedule an RCU callback if no RCU callback is pending. Must be called with + * the graph lock held. */ -void lockdep_free_key_range(void *start, unsigned long size) +static void call_rcu_zapped(struct pending_free *pf) +{ + WARN_ON_ONCE(inside_selftest()); + + if (list_empty(&pf->zapped)) + return; + + if (delayed_free.scheduled) + return; + + delayed_free.scheduled = true; + + WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); + delayed_free.index ^= 1; + + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); +} + +/* The caller must hold the graph lock. May be called from RCU context. */ +static void __free_zapped_classes(struct pending_free *pf) { struct lock_class *class; - struct hlist_head *head; + + check_data_structures(); + + list_for_each_entry(class, &pf->zapped, lock_entry) + reinit_class(class); + + list_splice_init(&pf->zapped, &free_lock_classes); + +#ifdef CONFIG_PROVE_LOCKING + bitmap_andnot(lock_chains_in_use, lock_chains_in_use, + pf->lock_chains_being_freed, ARRAY_SIZE(lock_chains)); + bitmap_clear(pf->lock_chains_being_freed, 0, ARRAY_SIZE(lock_chains)); +#endif +} + +static void free_zapped_rcu(struct rcu_head *ch) +{ + struct pending_free *pf; unsigned long flags; - int i; - int locked; + + if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) + return; raw_local_irq_save(flags); - locked = graph_lock(); + arch_spin_lock(&lockdep_lock); + current->lockdep_recursion = 1; + + /* closed head */ + pf = delayed_free.pf + (delayed_free.index ^ 1); + __free_zapped_classes(pf); + delayed_free.scheduled = false; /* - * Unhash all classes that were created by this module: + * If there's anything on the open list, close and start a new callback. */ + call_rcu_zapped(delayed_free.pf + delayed_free.index); + + current->lockdep_recursion = 0; + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +/* + * Remove all lock classes from the class hash table and from the + * all_lock_classes list whose key or name is in the address range [start, + * start + size). Move these lock classes to the zapped_classes list. Must + * be called with the graph lock held. + */ +static void __lockdep_free_key_range(struct pending_free *pf, void *start, + unsigned long size) +{ + struct lock_class *class; + struct hlist_head *head; + int i; + + /* Unhash all classes that were created by a module. */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; hlist_for_each_entry_rcu(class, head, hash_entry) { - if (within(class->key, start, size)) - zap_class(class); - else if (within(class->name, start, size)) - zap_class(class); + if (!within(class->key, start, size) && + !within(class->name, start, size)) + continue; + zap_class(pf, class); } } +} - if (locked) - graph_unlock(); +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one synchronize_rcu() before getting here, so we're + * guaranteed nobody will look up these exact classes -- they're properly dead + * but still allocated. 
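/*
 * zap_class() collects classes on the currently open pending_free buffer;
 * call_rcu_zapped() closes that buffer, flips delayed_free.index, and lets an
 * RCU callback free it while new zaps accumulate in the other buffer.  A
 * standalone sketch of that double buffering (the names and the single
 * nr_zapped counter are simplifications of the real lists and bitmaps):
 */
#include <stdio.h>
#include <stdbool.h>

struct pending_free_sketch {
	int nr_zapped;		/* stand-in for the pf->zapped list */
};

static struct {
	struct pending_free_sketch pf[2];
	int index;		/* which pf is open for new entries */
	bool scheduled;		/* is an "RCU callback" in flight?  */
} delayed;

/* Mirrors call_rcu_zapped(): close the open buffer and flip the index. */
static void schedule_free(void)
{
	if (!delayed.pf[delayed.index].nr_zapped || delayed.scheduled)
		return;
	delayed.scheduled = true;
	delayed.index ^= 1;	/* new zaps now go to the other buffer */
	printf("scheduled free of buffer %d\n", delayed.index ^ 1);
}

/* Mirrors free_zapped_rcu(): runs after a grace period. */
static void grace_period_elapsed(void)
{
	int closed = delayed.index ^ 1;

	printf("freeing %d classes from buffer %d\n",
	       delayed.pf[closed].nr_zapped, closed);
	delayed.pf[closed].nr_zapped = 0;
	delayed.scheduled = false;
	schedule_free();	/* anything queued meanwhile? close it too */
}

int main(void)
{
	delayed.pf[delayed.index].nr_zapped = 3;	/* three zap_class() calls */
	schedule_free();
	delayed.pf[delayed.index].nr_zapped = 1;	/* one more while waiting */
	grace_period_elapsed();
	grace_period_elapsed();
	return 0;
}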
+ */ +static void lockdep_free_key_range_reg(void *start, unsigned long size) +{ + struct pending_free *pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + current->lockdep_recursion = 1; + pf = get_pending_free(); + __lockdep_free_key_range(pf, start, size); + call_rcu_zapped(pf); + current->lockdep_recursion = 0; + arch_spin_unlock(&lockdep_lock); raw_local_irq_restore(flags); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. - * - * sync_sched() is sufficient because the read-side is IRQ disable. */ synchronize_rcu(); +} - /* - * XXX at this point we could return the resources to the pool; - * instead we leak them. We would need to change to bitmap allocators - * instead of the linear allocators we have now. - */ +/* + * Free all lockdep keys in the range [start, start+size). Does not sleep. + * Ignores debug_locks. Must only be used by the lockdep selftests. + */ +static void lockdep_free_key_range_imm(void *start, unsigned long size) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + init_data_structures_once(); + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_free_key_range(pf, start, size); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_free_key_range(void *start, unsigned long size) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_free_key_range_imm(start, size); + else + lockdep_free_key_range_reg(start, size); } /* @@ -4226,14 +5014,12 @@ static bool lock_class_cache_is_registered(struct lockdep_map *lock) return false; } -void lockdep_reset_lock(struct lockdep_map *lock) +/* The caller must hold the graph lock. Does not sleep. */ +static void __lockdep_reset_lock(struct pending_free *pf, + struct lockdep_map *lock) { struct lock_class *class; - unsigned long flags; - int j, locked; - - raw_local_irq_save(flags); - locked = graph_lock(); + int j; /* * Remove all classes this lock might have: @@ -4244,27 +5030,104 @@ void lockdep_reset_lock(struct lockdep_map *lock) */ class = look_up_lock_class(lock, j); if (class) - zap_class(class); + zap_class(pf, class); } /* * Debug check: in the end all mapped classes should * be gone. */ - if (unlikely(lock_class_cache_is_registered(lock))) { - if (debug_locks_off_graph_unlock()) { - /* - * We all just reset everything, how did it match? - */ - WARN_ON(1); + if (WARN_ON_ONCE(lock_class_cache_is_registered(lock))) + debug_locks_off(); +} + +/* + * Remove all information lockdep has about a lock if debug_locks == 1. Free + * released data structures from RCU context. + */ +static void lockdep_reset_lock_reg(struct lockdep_map *lock) +{ + struct pending_free *pf; + unsigned long flags; + int locked; + + raw_local_irq_save(flags); + locked = graph_lock(); + if (!locked) + goto out_irq; + + pf = get_pending_free(); + __lockdep_reset_lock(pf, lock); + call_rcu_zapped(pf); + + graph_unlock(); +out_irq: + raw_local_irq_restore(flags); +} + +/* + * Reset a lock. Does not sleep. Ignores debug_locks. Must only be used by the + * lockdep selftests. 
+ */ +static void lockdep_reset_lock_imm(struct lockdep_map *lock) +{ + struct pending_free *pf = delayed_free.pf; + unsigned long flags; + + raw_local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + __lockdep_reset_lock(pf, lock); + __free_zapped_classes(pf); + arch_spin_unlock(&lockdep_lock); + raw_local_irq_restore(flags); +} + +void lockdep_reset_lock(struct lockdep_map *lock) +{ + init_data_structures_once(); + + if (inside_selftest()) + lockdep_reset_lock_imm(lock); + else + lockdep_reset_lock_reg(lock); +} + +/* Unregister a dynamically allocated key. */ +void lockdep_unregister_key(struct lock_class_key *key) +{ + struct hlist_head *hash_head = keyhashentry(key); + struct lock_class_key *k; + struct pending_free *pf; + unsigned long flags; + bool found = false; + + might_sleep(); + + if (WARN_ON_ONCE(static_obj(key))) + return; + + raw_local_irq_save(flags); + if (!graph_lock()) + goto out_irq; + + pf = get_pending_free(); + hlist_for_each_entry_rcu(k, hash_head, hash_entry) { + if (k == key) { + hlist_del_rcu(&k->hash_entry); + found = true; + break; } - goto out_restore; } - if (locked) - graph_unlock(); - -out_restore: + WARN_ON_ONCE(!found); + __lockdep_free_key_range(pf, key, 1); + call_rcu_zapped(pf); + graph_unlock(); +out_irq: raw_local_irq_restore(flags); + + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ + synchronize_rcu(); } +EXPORT_SYMBOL_GPL(lockdep_unregister_key); void __init lockdep_init(void) { @@ -4278,20 +5141,25 @@ void __init lockdep_init(void) printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); - printk(" memory used by lock dependency info: %lu kB\n", - (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS + - sizeof(struct list_head) * CLASSHASH_SIZE + - sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + - sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + - sizeof(struct list_head) * CHAINHASH_SIZE + printk(" memory used by lock dependency info: %zu kB\n", + (sizeof(lock_classes) + + sizeof(lock_classes_in_use) + + sizeof(classhash_table) + + sizeof(list_entries) + + sizeof(list_entries_in_use) + + sizeof(chainhash_table) + + sizeof(delayed_free) #ifdef CONFIG_PROVE_LOCKING - + sizeof(struct circular_queue) + + sizeof(lock_cq) + + sizeof(lock_chains) + + sizeof(lock_chains_in_use) + + sizeof(chain_hlocks) #endif ) / 1024 ); - printk(" per task-struct memory footprint: %lu bytes\n", - sizeof(struct held_lock) * MAX_LOCK_DEPTH); + printk(" per task-struct memory footprint: %zu bytes\n", + sizeof(((struct task_struct *)NULL)->held_locks)); } static void diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 88c847a41c8a..cc83568d5012 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -22,6 +22,10 @@ enum lock_usage_bit { LOCK_USAGE_STATES }; +#define LOCK_USAGE_READ_MASK 1 +#define LOCK_USAGE_DIR_MASK 2 +#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK)) + /* * Usage-state bitmasks: */ @@ -38,13 +42,35 @@ enum { __LOCKF(USED) }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" + 0; 
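/*
 * The LOCKF_*_IRQ masks above are composed by re-including lockdep_states.h
 * with LOCKDEP_STATE redefined to "bit |" each time, terminated by 0.  The
 * same x-macro trick, self-contained: the bit values are stand-ins and the
 * state list is inlined instead of coming from a header.
 */
#include <stdio.h>

#define LOCKF_ENABLED_HARDIRQ	0x01
#define LOCKF_ENABLED_SOFTIRQ	0x02
#define LOCKF_USED_IN_HARDIRQ	0x04
#define LOCKF_USED_IN_SOFTIRQ	0x08

/* Stand-in for the per-state list that lockdep_states.h provides. */
#define LOCKDEP_STATES(STATE)	STATE(HARDIRQ) STATE(SOFTIRQ)

#define ENABLED_BIT(s)	LOCKF_ENABLED_##s |
static const unsigned long LOCKF_ENABLED_IRQ = LOCKDEP_STATES(ENABLED_BIT) 0;
#undef ENABLED_BIT

#define USED_IN_BIT(s)	LOCKF_USED_IN_##s |
static const unsigned long LOCKF_USED_IN_IRQ = LOCKDEP_STATES(USED_IN_BIT) 0;
#undef USED_IN_BIT

int main(void)
{
	printf("LOCKF_ENABLED_IRQ = 0x%lx, LOCKF_USED_IN_IRQ = 0x%lx\n",
	       LOCKF_ENABLED_IRQ, LOCKF_USED_IN_IRQ);
	return 0;
}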
+#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" + 0; +#undef LOCKDEP_STATE + +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) -#define LOCKF_ENABLED_IRQ_READ \ - (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ - (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ) /* * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, @@ -96,7 +122,8 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); extern unsigned long nr_lock_classes; extern unsigned long nr_list_entries; -extern unsigned long nr_lock_chains; +long lockdep_next_lockchain(long i); +unsigned long lock_chain_count(void); extern int nr_chain_hlocks; extern unsigned long nr_stack_trace_entries; @@ -104,7 +131,6 @@ extern unsigned int nr_hardirq_chains; extern unsigned int nr_softirq_chains; extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; -extern unsigned int max_recursion_depth; extern unsigned int max_bfs_queue_depth; @@ -133,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class) * and we want to avoid too much cache bouncing. */ struct lockdep_stats { - int chain_lookup_hits; - int chain_lookup_misses; - int hardirqs_on_events; - int hardirqs_off_events; - int redundant_hardirqs_on; - int redundant_hardirqs_off; - int softirqs_on_events; - int softirqs_off_events; - int redundant_softirqs_on; - int redundant_softirqs_off; - int nr_unused_locks; - int nr_redundant_checks; - int nr_redundant; - int nr_cyclic_checks; - int nr_cyclic_check_recursions; - int nr_find_usage_forwards_checks; - int nr_find_usage_forwards_recursions; - int nr_find_usage_backwards_checks; - int nr_find_usage_backwards_recursions; + unsigned long chain_lookup_hits; + unsigned int chain_lookup_misses; + unsigned long hardirqs_on_events; + unsigned long hardirqs_off_events; + unsigned long redundant_hardirqs_on; + unsigned long redundant_hardirqs_off; + unsigned long softirqs_on_events; + unsigned long softirqs_off_events; + unsigned long redundant_softirqs_on; + unsigned long redundant_softirqs_off; + int nr_unused_locks; + unsigned int nr_redundant_checks; + unsigned int nr_redundant; + unsigned int nr_cyclic_checks; + unsigned int nr_find_usage_forwards_checks; + unsigned int nr_find_usage_backwards_checks; /* * Per lock class locking operation stat counts diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index 3d31f9b0059e..65b6a1600c8f 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -104,18 +104,18 @@ static const struct seq_operations lockdep_ops = { #ifdef CONFIG_PROVE_LOCKING static void *lc_start(struct seq_file *m, loff_t *pos) { + if (*pos < 0) + return NULL; + if (*pos == 0) return SEQ_START_TOKEN; - if (*pos - 1 < nr_lock_chains) - return lock_chains + (*pos - 1); - - return NULL; + return lock_chains + (*pos - 1); } static void *lc_next(struct seq_file *m, void *v, loff_t *pos) { - (*pos)++; + *pos = lockdep_next_lockchain(*pos - 1) + 1; 
return lc_start(m, pos); } @@ -210,6 +210,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, sum_forward_deps = 0; +#ifdef CONFIG_PROVE_LOCKING list_for_each_entry(class, &all_lock_classes, lock_entry) { if (class->usage_mask == 0) @@ -241,13 +242,13 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) nr_hardirq_read_unsafe++; -#ifdef CONFIG_PROVE_LOCKING sum_forward_deps += lockdep_count_forward_deps(class); -#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); #endif + +#endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", nr_lock_classes, MAX_LOCKDEP_KEYS); seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", @@ -268,7 +269,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", - nr_lock_chains, MAX_LOCKDEP_CHAINS); + lock_chain_count(), MAX_LOCKDEP_CHAINS); seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); #endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7d0b0ed74404..c513031cd7e3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Module-based torture test facility for locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ @@ -45,7 +32,7 @@ #include <linux/torture.h> MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); @@ -842,7 +829,9 @@ static void lock_torture_cleanup(void) "End of test: SUCCESS"); kfree(cxt.lwsa); + cxt.lwsa = NULL; kfree(cxt.lrsa); + cxt.lrsa = NULL; end: torture_cleanup_end(); @@ -970,7 +959,7 @@ static int __init lock_torture_init(void) /* Prepare torture context. 
*/ if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, - onoff_interval * HZ); + onoff_interval * HZ, NULL); if (firsterr) goto unwind; } @@ -986,7 +975,7 @@ static int __init lock_torture_init(void) goto unwind; } if (stutter > 0) { - firsterr = torture_stutter_init(stutter); + firsterr = torture_stutter_init(stutter, stutter); if (firsterr) goto unwind; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db578783dd36..edd1c082dbf5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/locking/mutex.c * @@ -15,7 +16,7 @@ * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale * and Sven Dietrich. * - * Also see Documentation/locking/mutex-design.txt. + * Also see Documentation/locking/mutex-design.rst. */ #include <linux/mutex.h> #include <linux/ww_mutex.h> diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..364d38a0c444 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/atomic.h> #include <linux/rwsem.h> #include <linux/percpu.h> @@ -7,6 +8,8 @@ #include <linux/sched.h> #include <linux/errno.h> +#include "rwsem.h" + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { @@ -15,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + rcu_sync_init(&sem->rss); __init_rwsem(&sem->rw_sem, name, rwsem_key); rcuwait_init(&sem->writer); sem->readers_block = 0; diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index c7471c3fb798..fe9ca92faa2a 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued read/write locks * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. * * Authors: Waiman Long <waiman.long@hp.com> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 8a8c3c208c5e..2473f10c6956 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -1,16 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Queued spinlock * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014,2018 Red Hat, Inc. 
* (C) Copyright 2015 Intel Corp. @@ -124,9 +115,6 @@ static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(idx > 3); -#endif tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ @@ -398,7 +386,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,1,0 -> 0,0,1 */ clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); + lockevent_inc(lock_pending); return; /* @@ -406,18 +394,34 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * queuing. */ queue: - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); pv_queue: node = this_cpu_ptr(&qnodes[0].mcs); idx = node->count++; tail = encode_tail(smp_processor_id(), idx); + /* + * 4 nodes are allocated based on the assumption that there will + * not be nested NMIs taking spinlocks. That may not be true in + * some architectures even though the chance of needing more than + * 4 nodes will still be extremely unlikely. When that happens, + * we fall back to spinning on the lock directly without using + * any MCS node. This is not the most elegant solution, but is + * simple enough. + */ + if (unlikely(idx >= MAX_NODES)) { + lockevent_inc(lock_no_node); + while (!queued_spin_trylock(lock)) + cpu_relax(); + goto release; + } + node = grab_mcs_node(node, idx); /* * Keep counts of non-zero index values: */ - qstat_inc(qstat_lock_idx1 + idx - 1, idx); + lockevent_cond_inc(lock_use_node2 + idx - 1, idx); /* * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock) if (!(val & _Q_LOCKED_PENDING_MASK) && (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { - qstat_inc(qstat_pv_lock_stealing, true); + lockevent_inc(pv_lock_stealing); return true; } if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); - qstat_hop(hopcnt); + lockevent_pv_hop(hopcnt); return &he->lock; } } @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) smp_store_mb(pn->state, vcpu_halted); if (!READ_ONCE(node->locked)) { - qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_early, wait_early); + lockevent_inc(pv_wait_node); + lockevent_cond_inc(pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. 
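/*
 * encode_tail() packs (cpu + 1) and the per-CPU MCS node index into the tail
 * word, so 0 means "no tail" and the index stays below the four nodes the
 * comment above discusses.  A standalone sketch with assumed offsets; the
 * real _Q_TAIL_* constants live in qspinlock_types.h.
 */
#include <stdio.h>
#include <stdint.h>

#define Q_TAIL_IDX_OFFSET	16	/* assumption for illustration */
#define Q_TAIL_IDX_BITS		2
#define Q_TAIL_CPU_OFFSET	(Q_TAIL_IDX_OFFSET + Q_TAIL_IDX_BITS)

static uint32_t encode_tail(int cpu, int idx)
{
	uint32_t tail;

	tail  = (cpu + 1) << Q_TAIL_CPU_OFFSET;	/* 0 is reserved for "no tail" */
	tail |= idx << Q_TAIL_IDX_OFFSET;	/* per-CPU node slot, < 4      */
	return tail;
}

int main(void)
{
	printf("cpu 3, idx 1 -> tail 0x%x\n", (unsigned int)encode_tail(3, 1));
	return 0;
}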
*/ - qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); + lockevent_cond_inc(pv_spurious_wakeup, + !READ_ONCE(node->locked)); } /* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) /* * Tracking # of slowpath locking operations */ - qstat_inc(qstat_lock_slowpath, true); + lockevent_inc(lock_slowpath); for (;; waitcnt++) { /* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) } } WRITE_ONCE(pn->state, vcpu_hashed); - qstat_inc(qstat_pv_wait_head, true); - qstat_inc(qstat_pv_wait_again, waitcnt); + lockevent_inc(pv_wait_head); + lockevent_cond_inc(pv_wait_again, waitcnt); pv_wait(&lock->locked, _Q_SLOW_VAL); /* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) * vCPU is harmless other than the additional latency in completing * the unlock. */ - qstat_inc(qstat_pv_kick_unlock, true); + lockevent_inc(pv_kick_unlock); pv_kick(node->cpu); } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 42d3d8dc8f49..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -1,261 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com> */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * <debugfs>/qlockstat/ - * pv_hash_hops - average # of hops per hashing operation - * pv_kick_unlock - # of vCPU kicks issued at unlock time - * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake - * pv_latency_kick - average latency (ns) of vCPU kick operation - * pv_latency_wake - average latency (ns) from vCPU kick to wakeup - * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs - * pv_wait_again - # of wait's after a queue head vCPU kick - * pv_wait_early - # of early vCPU wait's - * pv_wait_head - # of vCPU wait's at the queue head - * pv_wait_node - # of vCPU wait's at a non-head queue node - * lock_pending - # of locking operations via pending code - * lock_slowpath - # of locking operations via MCS lock queue - * - * Writing to the "reset_counters" file will reset all the above counter - * values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. - * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
- */ -enum qlock_stats { - qstat_pv_hash_hops, - qstat_pv_kick_unlock, - qstat_pv_kick_wake, - qstat_pv_latency_kick, - qstat_pv_latency_wake, - qstat_pv_lock_stealing, - qstat_pv_spurious_wakeup, - qstat_pv_wait_again, - qstat_pv_wait_early, - qstat_pv_wait_head, - qstat_pv_wait_node, - qstat_lock_pending, - qstat_lock_slowpath, - qstat_lock_idx1, - qstat_lock_idx2, - qstat_lock_idx3, - qstat_num, /* Total number of statistical counters */ - qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts */ -#include <linux/debugfs.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/fs.h> -static const char * const qstat_names[qstat_num + 1] = { - [qstat_pv_hash_hops] = "pv_hash_hops", - [qstat_pv_kick_unlock] = "pv_kick_unlock", - [qstat_pv_kick_wake] = "pv_kick_wake", - [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", - [qstat_pv_latency_kick] = "pv_latency_kick", - [qstat_pv_latency_wake] = "pv_latency_wake", - [qstat_pv_lock_stealing] = "pv_lock_stealing", - [qstat_pv_wait_again] = "pv_wait_again", - [qstat_pv_wait_early] = "pv_wait_early", - [qstat_pv_wait_head] = "pv_wait_head", - [qstat_pv_wait_node] = "pv_wait_node", - [qstat_lock_pending] = "lock_pending", - [qstat_lock_slowpath] = "lock_slowpath", - [qstat_lock_idx1] = "lock_index1", - [qstat_lock_idx2] = "lock_index2", - [qstat_lock_idx3] = "lock_index3", - [qstat_reset_cnts] = "reset_counters", -}; +#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev] /* - * Per-cpu counters + * PV specific per-cpu counter */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); static DEFINE_PER_CPU(u64, pv_kick_time); /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts. * * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. pv_latency_kick * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake * Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. 
pv_hash_hops * Average hops/hash = pv_hash_hops/pv_kick_unlock */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) { char buf[64]; - int cpu, counter, len; - u64 stat = 0, kicks = 0; + int cpu, id, len; + u64 sum = 0, kicks = 0; /* * Get the counter ID stored in file->f_inode->i_private */ - counter = (long)file_inode(file)->i_private; + id = (long)file_inode(file)->i_private; - if (counter >= qstat_num) + if (id >= lockevent_num) return -EBADF; for_each_possible_cpu(cpu) { - stat += per_cpu(qstats[counter], cpu); + sum += per_cpu(lockevents[id], cpu); /* - * Need to sum additional counter for some of them + * Need to sum additional counters for some of them */ - switch (counter) { + switch (id) { - case qstat_pv_latency_kick: - case qstat_pv_hash_hops: - kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + case LOCKEVENT_pv_latency_kick: + case LOCKEVENT_pv_hash_hops: + kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu); break; - case qstat_pv_latency_wake: - kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + case LOCKEVENT_pv_latency_wake: + kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu); break; } } - if (counter == qstat_pv_hash_hops) { + if (id == LOCKEVENT_pv_hash_hops) { u64 frac = 0; if (kicks) { - frac = 100ULL * do_div(stat, kicks); + frac = 100ULL * do_div(sum, kicks); frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); } /* * Return a X.XX decimal number */ - len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", + sum, frac); } else { /* * Round to the nearest ns */ - if ((counter == qstat_pv_latency_kick) || - (counter == qstat_pv_latency_wake)) { + if ((id == LOCKEVENT_pv_latency_kick) || + (id == LOCKEVENT_pv_latency_wake)) { if (kicks) - stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + sum = DIV_ROUND_CLOSEST_ULL(sum, kicks); } - len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); } return simple_read_from_buffer(user_buf, count, ppos, buf, len); } /* - * Function to handle write request - * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. - */ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - int cpu; - - /* - * Get the counter ID stored in file->f_inode->i_private - */ - if ((long)file_inode(file)->i_private != qstat_reset_cnts) - return count; - - for_each_possible_cpu(cpu) { - int i; - unsigned long *ptr = per_cpu_ptr(qstats, cpu); - - for (i = 0 ; i < qstat_num; i++) - WRITE_ONCE(ptr[i], 0); - } - return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_qstat = { - .read = qstat_read, - .write = qstat_write, - .llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ - struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); - int i; - - if (!d_qstat) - goto out; - - /* - * Create the debugfs files - * - * As reading from and writing to the stat files can be slow, only - * root is allowed to do the read/write to limit impact to system - * performance. 
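/*
 * lockevent_read() sums the per-CPU counters and, for pv_hash_hops, prints a
 * quotient.fraction average using do_div(), which divides in place and
 * returns the remainder.  A standalone sketch of that arithmetic;
 * do_div_sketch() is a userspace stand-in, and with hops=7 over kicks=3 the
 * program prints "2.33".
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t do_div_sketch(uint64_t *n, uint32_t base)
{
	uint64_t rem = *n % base;

	*n /= base;
	return rem;
}

int main(void)
{
	uint64_t hops = 7, frac = 0;	/* hypothetical summed counters */
	uint32_t kicks = 3;

	if (kicks) {
		frac = 100 * do_div_sketch(&hops, kicks);	/* 100 * remainder */
		frac = (frac + kicks / 2) / kicks;		/* round to nearest */
	}
	printf("pv_hash_hops average: %" PRIu64 ".%02" PRIu64 "\n", hops, frac);
	return 0;
}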
- */ - for (i = 0; i < qstat_num; i++) - if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, - (void *)(long)i, &fops_qstat)) - goto fail_undo; - - if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, - (void *)(long)qstat_reset_cnts, &fops_qstat)) - goto fail_undo; - - return 0; -fail_undo: - debugfs_remove_recursive(d_qstat); -out: - pr_warn("Could not create 'qlockstat' debugfs entries\n"); - return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ - if (cond) - this_cpu_inc(qstats[stat]); -} - -/* * PV hash hop count */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt) { - this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); + this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt); } /* @@ -267,7 +111,7 @@ static inline void __pv_kick(int cpu) per_cpu(pv_kick_time, cpu) = start; pv_kick(cpu); - this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); + this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start); } /* @@ -280,18 +124,19 @@ static inline void __pv_wait(u8 *ptr, u8 val) *pkick_time = 0; pv_wait(ptr, val); if (*pkick_time) { - this_cpu_add(qstats[qstat_pv_latency_wake], + this_cpu_add(EVENT_COUNT(pv_latency_wake), sched_clock() - *pkick_time); - qstat_inc(qstat_pv_kick_wake, true); + lockevent_inc(pv_kick_wake); } } #define pv_kick(c) __pv_kick(c) #define pv_wait(p, v) __pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) { } -static inline void qstat_hop(int hopcnt) { } +static inline void lockevent_pv_hop(int hopcnt) { } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 978d63a8261c..fa83d36e30c6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RT-Mutexes: simple blocking mutual exclusion locks with PI support * @@ -8,7 +9,7 @@ * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen * - * See Documentation/locking/rt-mutex-design.txt for details. + * See Documentation/locking/rt-mutex-design.rst for details. */ #include <linux/spinlock.h> #include <linux/export.h> diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001 David Howells (dhowells@redhat.com). 
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/export.h> - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ - int ret = 1; - unsigned long flags; - - if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->count != 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - } - return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - sem->count = 0; - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - * - the 'active count' _reached_ zero - * - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ - struct rwsem_waiter *waiter; - struct task_struct *tsk; - int woken; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wakewrite) - /* Wake up a writer. Note that we do not grant it the - * lock - it will have to acquire it when it runs. */ - wake_up_process(waiter->task); - goto out; - } - - /* grant an infinite number of read locks to the front of the queue */ - woken = 0; - do { - struct list_head *next = waiter->list.next; - - list_del(&waiter->list); - tsk = waiter->task; - /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). 
- */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); - woken++; - if (next == &sem->wait_list) - break; - waiter = list_entry(next, struct rwsem_waiter, list); - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - sem->count += woken; - - out: - return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ - struct rwsem_waiter *waiter; - - waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - wake_up_process(waiter->task); - - return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - goto out; - } - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(current); - - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait to be given the lock */ - for (;;) { - if (!waiter.task) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: - return 0; - -out_nolock: - /* - * We didn't take the lock, so that there is a writer, which - * is owner or the first waiter of the sem. If it's a waiter, - * it will be woken by current owner. Not need to wake anybody. - */ - list_del(&waiter.list); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ - __down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ - return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->count++; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ - struct rwsem_waiter waiter; - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - /* set up my own style of waitqueue */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - list_add_tail(&waiter.list, &sem->wait_list); - - /* wait for someone to release the lock */ - for (;;) { - /* - * That is the key to support write lock stealing: allows the - * task already on CPU to get the lock soon rather than put - * itself into sleep and waiting for system woke it or someone - * else in the head of the wait list up. 
- */ - if (sem->count == 0) - break; - if (signal_pending_state(state, current)) - goto out_nolock; - - set_current_state(state); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - schedule(); - raw_spin_lock_irqsave(&sem->wait_lock, flags); - } - /* got the lock */ - sem->count = -1; - list_del(&waiter.list); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; - -out_nolock: - list_del(&waiter.list); - if (!list_empty(&sem->wait_list) && sem->count >= 0) - __rwsem_do_wake(sem, 0); - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ - __down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ - return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ - unsigned long flags; - int ret = 0; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->count == 0) { - /* got the lock */ - sem->count = -1; - ret = 1; - } - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - - return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->count == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - sem->count = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 50d9af615dc4..000000000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null @@ -1,725 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem.c: R/W semaphores: contention handling functions - * - * Written by David Howells (dhowells@redhat.com). - * Derived from arch/i386/kernel/semaphore.c - * - * Writer lock-stealing by Alex Shi <alex.shi@intel.com> - * and Michel Lespinasse <walken@google.com> - * - * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> - * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. - */ -#include <linux/rwsem.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/sched/signal.h> -#include <linux/sched/rt.h> -#include <linux/sched/wake_q.h> -#include <linux/sched/debug.h> -#include <linux/osq_lock.h> - -#include "rwsem.h" - -/* - * Guide to the rw_semaphore's count field for common values. - * (32-bit case illustrated, similar for 64-bit) - * - * 0x0000000X (1) X readers active or attempting lock, no writer waiting - * X = #active_readers + #readers attempting to lock - * (X*ACTIVE_BIAS) - * - * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or - * attempting to read lock or write lock. 
- * - * 0xffff000X (1) X readers active or attempting lock, with waiters for lock - * X = #active readers + # readers attempting lock - * (X*ACTIVE_BIAS + WAITING_BIAS) - * (2) 1 writer attempting lock, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * (3) 1 writer active, no waiters for lock - * X-1 = #active readers + #readers attempting lock - * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) - * - * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock - * (WAITING_BIAS + ACTIVE_BIAS) - * (2) 1 writer active or attempting lock, no waiters for lock - * (ACTIVE_WRITE_BIAS) - * - * 0xffff0000 (1) There are writers or readers queued but none active - * or in the process of attempting lock. - * (WAITING_BIAS) - * Note: writer can attempt to steal lock for this count by adding - * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count - * - * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. - * (ACTIVE_WRITE_BIAS + WAITING_BIAS) - * - * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking - * the count becomes more than 0 for successful lock acquisition, - * i.e. the case where there are only readers or nobody has lock. - * (1st and 2nd case above). - * - * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and - * checking the count becomes ACTIVE_WRITE_BIAS for successful lock - * acquisition (i.e. nobody else has lock or attempts lock). If - * unsuccessful, in rwsem_down_write_failed, we'll check to see if there - * are only waiters but none active (5th case above), and attempt to - * steal the lock. - * - */ - -/* - * Initialize an rwsem: - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, - struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held semaphore: - */ - debug_check_no_locks_freed((void *)sem, sizeof(*sem)); - lockdep_init_map(&sem->dep_map, name, key, 0); -#endif - atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER - sem->owner = NULL; - osq_lock_init(&sem->osq); -#endif -} - -EXPORT_SYMBOL(__init_rwsem); - -enum rwsem_waiter_type { - RWSEM_WAITING_FOR_WRITE, - RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { - struct list_head list; - struct task_struct *task; - enum rwsem_waiter_type type; -}; - -enum rwsem_wake_type { - RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ - RWSEM_WAKE_READERS, /* Wake readers only */ - RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ -}; - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here from up_xxxx(), then: - * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) - * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) - * - there must be someone on the queue - * - the wait_lock must be held by the caller - * - tasks are marked for wakeup, the caller must later invoke wake_up_q() - * to actually wakeup the blocked task(s) and drop the reference count, - * preferably when the wait_lock is released - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only marked woken if downgrading is false - */ -static void __rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, - struct wake_q_head *wake_q) -{ - struct rwsem_waiter *waiter, *tmp; - long 
oldcount, woken = 0, adjustment = 0; - - /* - * Take a peek at the queue head waiter such that we can determine - * the wakeup(s) to perform. - */ - waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) { - /* - * Mark writer at the front of the queue for wakeup. - * Until the task is actually later awoken later by - * the caller, other writers are able to steal it. - * Readers, on the other hand, will block as they - * will notice the queued writer. - */ - wake_q_add(wake_q, waiter->task); - } - - return; - } - - /* - * Writers might steal the lock before we grant it to the next reader. - * We prefer to do the first reader grant before counting readers - * so we can bail out early if a writer stole the lock. - */ - if (wake_type != RWSEM_WAKE_READ_OWNED) { - adjustment = RWSEM_ACTIVE_READ_BIAS; - try_reader_grant: - oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* - * If the count is still less than RWSEM_WAITING_BIAS - * after removing the adjustment, it is assumed that - * a writer has stolen the lock. We have to undo our - * reader grant. - */ - if (atomic_long_add_return(-adjustment, &sem->count) < - RWSEM_WAITING_BIAS) - return; - - /* Last active locker left. Retry waking readers. */ - goto try_reader_grant; - } - /* - * It is not really necessary to set it to reader-owned here, - * but it gives the spinners an early indication that the - * readers now have the lock. - */ - __rwsem_set_reader_owned(sem, waiter->task); - } - - /* - * Grant an infinite number of read locks to the readers at the front - * of the queue. We know that woken will be at least 1 as we accounted - * for above. Note we increment the 'active part' of the count by the - * number of readers before waking any processes up. - */ - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { - struct task_struct *tsk; - - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - break; - - woken++; - tsk = waiter->task; - - get_task_struct(tsk); - list_del(&waiter->list); - /* - * Ensure calling get_task_struct() before setting the reader - * waiter to nil such that rwsem_down_read_failed() cannot - * race with do_exit() by always holding a reference count - * to the task to wakeup. - */ - smp_store_release(&waiter->task, NULL); - /* - * Ensure issuing the wakeup (either by us or someone else) - * after setting the reader waiter to nil. - */ - wake_q_add(wake_q, tsk); - /* wake_q_add() already take the task ref */ - put_task_struct(tsk); - } - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - } - - if (adjustment) - atomic_long_add(adjustment, &sem->count); -} - -/* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ - long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; - struct rwsem_waiter waiter; - DEFINE_WAKE_Q(wake_q); - - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_READ; - - raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { - /* - * In case the wait queue is empty and the lock isn't owned - * by a writer, this reader can exit the slowpath and return - * immediately as its RWSEM_ACTIVE_READ_BIAS has already - * been set in the count. 
- */ - if (atomic_long_read(&sem->count) >= 0) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - adjustment += RWSEM_WAITING_BIAS; - } - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - count = atomic_long_add_return(adjustment, &sem->count); - - /* - * If there are no active locks, wake the front queued process(es). - * - * If there are no writers and we are first in the queue, - * wake our own waiter to join the existing active readers ! - */ - if (count == RWSEM_WAITING_BIAS || - (count > RWSEM_WAITING_BIAS && - adjustment != -RWSEM_ACTIVE_READ_BIAS)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - /* wait to be given the lock */ - while (true) { - set_current_state(state); - if (!waiter.task) - break; - if (signal_pending_state(state, current)) { - raw_spin_lock_irq(&sem->wait_lock); - if (waiter.task) - goto out_nolock; - raw_spin_unlock_irq(&sem->wait_lock); - break; - } - schedule(); - } - - __set_current_state(TASK_RUNNING); - return sem; -out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - raw_spin_unlock_irq(&sem->wait_lock); - __set_current_state(TASK_RUNNING); - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/* - * This function must be called with the sem->wait_lock held to prevent - * race conditions between checking the rwsem wait list and setting the - * sem->count accordingly. - */ -static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) -{ - /* - * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. - */ - if (count != RWSEM_WAITING_BIAS) - return false; - - /* - * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there - * are other tasks on the wait list, we need to add on WAITING_BIAS. - */ - count = list_is_singular(&sem->wait_list) ? - RWSEM_ACTIVE_WRITE_BIAS : - RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; - - if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) - == RWSEM_WAITING_BIAS) { - rwsem_set_owner(sem); - return true; - } - - return false; -} - -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * Try to acquire write lock before the writer has been put on wait queue. 
- */ -static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) -{ - long old, count = atomic_long_read(&sem->count); - - while (true) { - if (!(count == 0 || count == RWSEM_WAITING_BIAS)) - return false; - - old = atomic_long_cmpxchg_acquire(&sem->count, count, - count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) { - rwsem_set_owner(sem); - return true; - } - - count = old; - } -} - -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner; - bool ret = true; - - BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); - - if (need_resched()) - return false; - - rcu_read_lock(); - owner = READ_ONCE(sem->owner); - if (owner) { - ret = is_rwsem_owner_spinnable(owner) && - owner_on_cpu(owner); - } - rcu_read_unlock(); - return ret; -} - -/* - * Return true only if we can still spin on the owner field of the rwsem. - */ -static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) -{ - struct task_struct *owner = READ_ONCE(sem->owner); - - if (!is_rwsem_owner_spinnable(owner)) - return false; - - rcu_read_lock(); - while (owner && (READ_ONCE(sem->owner) == owner)) { - /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking sem->owner still matches owner, if that fails, - * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. - */ - barrier(); - - /* - * abort spinning when need_resched or owner is not running or - * owner's cpu is preempted. - */ - if (need_resched() || !owner_on_cpu(owner)) { - rcu_read_unlock(); - return false; - } - - cpu_relax(); - } - rcu_read_unlock(); - - /* - * If there is a new owner or the owner is not set, we continue - * spinning. - */ - return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); -} - -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - bool taken = false; - - preempt_disable(); - - /* sem->wait_lock should not be held when doing optimistic spinning */ - if (!rwsem_can_spin_on_owner(sem)) - goto done; - - if (!osq_lock(&sem->osq)) - goto done; - - /* - * Optimistically spin on the owner field and attempt to acquire the - * lock whenever the owner changes. Spinning will be stopped when: - * 1) the owning writer isn't running; or - * 2) readers own the lock as we can't determine if they are - * actively running or not. - */ - while (rwsem_spin_on_owner(sem)) { - /* - * Try to acquire the lock - */ - if (rwsem_try_write_lock_unqueued(sem)) { - taken = true; - break; - } - - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!sem->owner && (need_resched() || rt_task(current))) - break; - - /* - * The cpu_relax() call is a compiler barrier which forces - * everything in this loop to be re-loaded. We don't need - * memory barriers as we'll eventually observe the right - * values at the cost of a few extra spins. 
- */ - cpu_relax(); - } - osq_unlock(&sem->osq); -done: - preempt_enable(); - return taken; -} - -/* - * Return true if the rwsem has active spinner - */ -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return osq_is_locked(&sem->osq); -} - -#else -static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -{ - return false; -} - -static inline bool rwsem_has_spinner(struct rw_semaphore *sem) -{ - return false; -} -#endif - -/* - * Wait until we successfully acquire the write lock - */ -static inline struct rw_semaphore * -__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) -{ - long count; - bool waiting = true; /* any queued threads before us */ - struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; - DEFINE_WAKE_Q(wake_q); - - /* undo write bias from down_write operation, stop active locking */ - count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); - - /* do optimistic spinning and steal lock if possible */ - if (rwsem_optimistic_spin(sem)) - return sem; - - /* - * Optimistic spinning failed, proceed to the slowpath - * and block until we can acquire the sem. - */ - waiter.task = current; - waiter.type = RWSEM_WAITING_FOR_WRITE; - - raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - if (list_empty(&sem->wait_list)) - waiting = false; - - list_add_tail(&waiter.list, &sem->wait_list); - - /* we're now waiting on the lock, but no longer actively locking */ - if (waiting) { - count = atomic_long_read(&sem->count); - - /* - * If there were already threads queued before us and there are - * no active writers, the lock must be read owned; so we try to - * wake any read locks that were queued ahead of us. - */ - if (count > RWSEM_WAITING_BIAS) { - __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); - /* - * The wakeup is normally called _after_ the wait_lock - * is released, but given that we are proactively waking - * readers we can deal with the wake_q overhead as it is - * similar to releasing and taking the wait_lock again - * for attempting rwsem_try_write_lock(). - */ - wake_up_q(&wake_q); - - /* - * Reinitialize wake_q after use. - */ - wake_q_init(&wake_q); - } - - } else - count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); - - /* wait until we successfully acquire the lock */ - set_current_state(state); - while (true) { - if (rwsem_try_write_lock(count, sem)) - break; - raw_spin_unlock_irq(&sem->wait_lock); - - /* Block until there are no active lockers. 
*/ - do { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - set_current_state(state); - } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); - - raw_spin_lock_irq(&sem->wait_lock); - } - __set_current_state(TASK_RUNNING); - list_del(&waiter.list); - raw_spin_unlock_irq(&sem->wait_lock); - - return ret; - -out_nolock: - __set_current_state(TASK_RUNNING); - raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) - atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); - else - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - raw_spin_unlock_irq(&sem->wait_lock); - wake_up_q(&wake_q); - - return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_write_failed_killable(struct rw_semaphore *sem) -{ - return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_write_failed_killable); - -/* - * handle waking up a waiter on the semaphore - * - up_read/up_write has decremented the active part of count if we come here - */ -__visible -struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - /* - * __rwsem_down_write_failed_common(sem) - * rwsem_optimistic_spin(sem) - * osq_unlock(sem->osq) - * ... - * atomic_long_add_return(&sem->count) - * - * - VS - - * - * __up_write() - * if (atomic_long_sub_return_release(&sem->count) < 0) - * rwsem_wake(sem) - * osq_is_locked(&sem->osq) - * - * And __up_write() must observe !osq_is_locked() when it observes the - * atomic_long_add_return() in order to not miss a wakeup. - * - * This boils down to: - * - * [S.rel] X = 1 [RmW] r0 = (Y += 0) - * MB RMB - * [RmW] Y += 1 [L] r1 = X - * - * exists (r0=1 /\ r1=0) - */ - smp_rmb(); - - /* - * If a spinner is present, it is not necessary to do the wakeup. - * Try to do wakeup only if the trylock succeeds to minimize - * spinlock contention which may introduce too much delay in the - * unlock operation. - * - * spinning writer up_write/up_read caller - * --------------- ----------------------- - * [S] osq_unlock() [L] osq - * MB RMB - * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) - * - * Here, it is important to make sure that there won't be a missed - * wakeup while the rwsem is free and the only spinning writer goes - * to sleep without taking the rwsem. Even when the spinning writer - * is just going to break out of the waiting loop, it will still do - * a trylock in rwsem_down_write_failed() before sleeping. IOW, if - * rwsem_has_spinner() is true, it will guarantee at least one - * trylock attempt on the rwsem later on. - */ - if (rwsem_has_spinner(sem)) { - /* - * The smp_rmb() here is to make sure that the spinner - * state is consulted before reading the wait_lock. 
- */ - smp_rmb(); - if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) - return sem; - goto locked; - } - raw_spin_lock_irqsave(&sem->wait_lock, flags); -locked: - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_wake); - -/* - * downgrade a write lock into a read lock - * - caller incremented waiting part of count and discovered it still negative - * - just wake up any readers at the front of the queue - */ -__visible -struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) -{ - unsigned long flags; - DEFINE_WAKE_Q(wake_q); - - raw_spin_lock_irqsave(&sem->wait_lock, flags); - - if (!list_empty(&sem->wait_list)) - __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); - - raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - wake_up_q(&wake_q); - - return sem; -} -EXPORT_SYMBOL(rwsem_downgrade_wake); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..37524a47f002 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -3,17 +3,1438 @@ * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h + * + * Writer lock-stealing by Alex Shi <alex.shi@intel.com> + * and Michel Lespinasse <walken@google.com> + * + * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> + * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. + * + * Rwsem count bit fields re-definition and rwsem rearchitecture by + * Waiman Long <longman@redhat.com> and + * Peter Zijlstra <peterz@infradead.org>. */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> +#include <linux/sched/rt.h> +#include <linux/sched/task.h> #include <linux/sched/debug.h> +#include <linux/sched/wake_q.h> +#include <linux/sched/signal.h> +#include <linux/sched/clock.h> #include <linux/export.h> #include <linux/rwsem.h> #include <linux/atomic.h> #include "rwsem.h" +#include "lock_events.h" + +/* + * The least significant 3 bits of the owner value has the following + * meanings when set. + * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers + * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. + * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. + * + * When the rwsem is either owned by an anonymous writer, or it is + * reader-owned, but a spinning writer has timed out, both nonspinnable + * bits will be set to disable optimistic spinning by readers and writers. + * In the later case, the last unlocking reader should then check the + * writer nonspinnable bit and clear it only to give writers preference + * to acquire the lock via optimistic spinning, but not readers. Similar + * action is also done in the reader slowpath. + + * When a writer acquires a rwsem, it puts its task_struct pointer + * into the owner field. It is cleared after an unlock. + * + * When a reader acquires a rwsem, it will also puts its task_struct + * pointer into the owner field with the RWSEM_READER_OWNED bit set. + * On unlock, the owner field will largely be left untouched. So + * for a free or reader-owned rwsem, the owner value may contain + * information about the last reader that acquires the rwsem. + * + * That information may be helpful in debugging cases where the system + * seems to hang on a reader owned rwsem especially if only one reader + * is involved. 
Ideally we would like to track all the readers that own
+ * a rwsem, but the overhead is simply too big.
+ *
+ * Reader optimistic spinning is helpful when the reader critical section
+ * is short and there aren't that many readers around. It makes readers
+ * relatively more preferred than writers. When a writer times out spinning
+ * on a reader-owned lock and sets the nonspinnable bits, there are two main
+ * reasons for that.
+ *
+ * 1) The reader critical section is long, perhaps the task sleeps after
+ *    acquiring the read lock.
+ * 2) There are just too many readers contending the lock causing it to
+ *    take a while to service all of them.
+ *
+ * In the former case, a long reader critical section will impede the progress
+ * of writers which is usually more important for system performance. In
+ * the latter case, reader optimistic spinning tends to make the reader
+ * groups that contain readers that acquire the lock together smaller,
+ * leading to more of them. That may hurt performance in some cases. In
+ * other words, the setting of nonspinnable bits indicates that reader
+ * optimistic spinning may not be helpful for those workloads that cause
+ * it.
+ *
+ * Therefore, any writers that had observed the setting of the writer
+ * nonspinnable bit for a given rwsem after they fail to acquire the lock
+ * via optimistic spinning will set the reader nonspinnable bit once they
+ * acquire the write lock. Similarly, readers that observe the setting
+ * of reader nonspinnable bit at slowpath entry will set the reader
+ * nonspinnable bits when they acquire the read lock via the wakeup path.
+ *
+ * Once the reader nonspinnable bit is on, it will only be reset when
+ * a writer is able to acquire the rwsem in the fast path or somehow a
+ * reader or writer in the slowpath doesn't observe the nonspinnable bit.
+ *
+ * This is to discourage reader optimistic spinning on that particular
+ * rwsem and make writers more preferred. This adaptive disabling of reader
+ * optimistic spinning will alleviate the negative side effect of this
+ * feature.
+ */
+#define RWSEM_READER_OWNED	(1UL << 0)
+#define RWSEM_RD_NONSPINNABLE	(1UL << 1)
+#define RWSEM_WR_NONSPINNABLE	(1UL << 2)
+#define RWSEM_NONSPINNABLE	(RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
+#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\
+	if (!debug_locks_silent &&				\
+	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+		#c, atomic_long_read(&(sem)->count),		\
+		atomic_long_read(&(sem)->owner), (long)current,	\
+		list_empty(&(sem)->wait_list) ? "" : "not "))	\
+			debug_locks_off();			\
+	} while (0)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c, sem)
+#endif
+
+/*
+ * On 64-bit architectures, the bit definitions of the count are:
+ *
+ * Bit  0    - writer locked bit
+ * Bit  1    - waiters present bit
+ * Bit  2    - lock handoff bit
+ * Bits 3-7  - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit  63   - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit  0    - writer locked bit
+ * Bit  1    - waiters present bit
+ * Bit  2    - lock handoff bit
+ * Bits 3-7  - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit  31   - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set.
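To make the count layout just listed concrete, here is a tiny stand-alone decoder (an editor's illustration in plain user-space C, not part of the patch; the bit values are copied from the RWSEM_* definitions added further down in this hunk, and the read-fail bit is ignored for these small values):

#include <stdio.h>

#define RWSEM_WRITER_LOCKED	(1UL << 0)
#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_FLAG_HANDOFF	(1UL << 2)
#define RWSEM_READER_SHIFT	8
#define RWSEM_READER_BIAS	(1UL << RWSEM_READER_SHIFT)

static void decode(unsigned long count)
{
	printf("count=%#06lx  readers=%lu  wlocked=%d  waiters=%d  handoff=%d\n",
	       count, count >> RWSEM_READER_SHIFT,
	       !!(count & RWSEM_WRITER_LOCKED),
	       !!(count & RWSEM_FLAG_WAITERS),
	       !!(count & RWSEM_FLAG_HANDOFF));
}

int main(void)
{
	decode(0);					  /* unlocked, no waiters */
	decode(3 * RWSEM_READER_BIAS);			  /* three active readers */
	decode(RWSEM_WRITER_LOCKED | RWSEM_FLAG_WAITERS); /* write-locked, queue not empty */
	decode(RWSEM_FLAG_WAITERS | RWSEM_FLAG_HANDOFF);  /* free, hand off to first waiter */
	return 0;
}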
This guard bit is still checked anyway in the down_read() fastpath + * just in case we need to use up more of the reader bits for other purpose + * in the future. + * + * atomic_long_fetch_add() is used to obtain reader lock, whereas + * atomic_long_cmpxchg() will be used to obtain writer lock. + * + * There are three places where the lock handoff bit may be set or cleared. + * 1) rwsem_mark_wake() for readers. + * 2) rwsem_try_write_lock() for writers. + * 3) Error path of rwsem_down_write_slowpath(). + * + * For all the above cases, wait_lock will be held. A writer must also + * be the first one in the wait_list to be eligible for setting the handoff + * bit. So concurrent setting/clearing of handoff bit is not possible. + */ +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define RWSEM_FLAG_WAITERS (1UL << 1) +#define RWSEM_FLAG_HANDOFF (1UL << 2) +#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) + +#define RWSEM_READER_SHIFT 8 +#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) +#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) +#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED +#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) +#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ + RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) + +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + atomic_long_set(&sem->owner, (long)current); +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + atomic_long_set(&sem->owner, 0); +} + +/* + * Test the flags in the owner field. + */ +static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) +{ + return atomic_long_read(&sem->owner) & flags; +} + +/* + * The task_struct pointer of the last owning reader will be left in + * the owner field. + * + * Note that the owner value just indicates the task has owned the rwsem + * previously, it may not be the real owner or one of the real owners + * anymore when that field is examined, so take it with a grain of salt. + * + * The reader non-spinnable bit is preserved. + */ +static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, + struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | + (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); + + atomic_long_set(&sem->owner, val); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + __rwsem_set_reader_owned(sem, current); +} + +/* + * Return true if the rwsem is owned by a reader. + */ +static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +{ +#ifdef CONFIG_DEBUG_RWSEMS + /* + * Check the count to see if it is write-locked. + */ + long count = atomic_long_read(&sem->count); + + if (count & RWSEM_WRITER_MASK) + return false; +#endif + return rwsem_test_oflags(sem, RWSEM_READER_OWNED); +} + +#ifdef CONFIG_DEBUG_RWSEMS +/* + * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there + * is a task pointer in owner of a reader-owned rwsem, it will be the + * real owner or one of the real owners. The only exception is when the + * unlock is done by up_read_non_owner(). 
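The owner-word packing used by __rwsem_set_reader_owned() above can be sketched in isolation as follows (editor's sketch, not from the patch; the flag values are copied from this hunk and the task address is a made-up, suitably aligned stand-in for a task_struct pointer):

#include <stdio.h>

#define RWSEM_READER_OWNED	(1UL << 0)
#define RWSEM_RD_NONSPINNABLE	(1UL << 1)
#define RWSEM_WR_NONSPINNABLE	(1UL << 2)
#define RWSEM_NONSPINNABLE	(RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)

int main(void)
{
	unsigned long task = 0x12340000UL;	/* pretend, aligned task_struct address */
	unsigned long owner;

	/* what a reader acquisition stores: task pointer plus the reader-owned bit */
	owner = task | RWSEM_READER_OWNED;

	/* a writer timed out spinning on the reader-owned lock: disable spinning */
	owner |= RWSEM_NONSPINNABLE;

	printf("task  = %#lx\n", owner & ~RWSEM_OWNER_FLAGS_MASK);	/* 0x12340000 */
	printf("flags = %#lx\n", owner & RWSEM_OWNER_FLAGS_MASK);	/* 0x7 */
	return 0;
}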
+ */ +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ + unsigned long val = atomic_long_read(&sem->owner); + + while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { + if (atomic_long_try_cmpxchg(&sem->owner, &val, + val & RWSEM_OWNER_FLAGS_MASK)) + return; + } +} +#else +static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) +{ +} +#endif + +/* + * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag + * remains set. Otherwise, the operation will be aborted. + */ +static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + do { + if (!(owner & RWSEM_READER_OWNED)) + break; + if (owner & RWSEM_NONSPINNABLE) + break; + } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, + owner | RWSEM_NONSPINNABLE)); +} + +static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +{ + long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + if (WARN_ON_ONCE(cnt < 0)) + rwsem_set_nonspinnable(sem); + return !(cnt & RWSEM_READ_FAILED_MASK); +} + +/* + * Return just the real task structure pointer of the owner + */ +static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +{ + return (struct task_struct *) + (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Return the real task structure pointer of the owner and the embedded + * flags in the owner. pflags must be non-NULL. + */ +static inline struct task_struct * +rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + *pflags = owner & RWSEM_OWNER_FLAGS_MASK; + return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); +} + +/* + * Guide to the rw_semaphore's count field. + * + * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned + * by a writer. + * + * The lock is owned by readers when + * (1) the RWSEM_WRITER_LOCKED isn't set in count, + * (2) some of the reader bits are set in count, and + * (3) the owner field has RWSEM_READ_OWNED bit set. + * + * Having some reader bits set is not enough to guarantee a readers owned + * lock as the readers may be in the process of backing out from the count + * and a writer has just released the lock. So another writer may steal + * the lock immediately after that. 
+ */ + +/* + * Initialize an rwsem: + */ +void __init_rwsem(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); + raw_spin_lock_init(&sem->wait_lock); + INIT_LIST_HEAD(&sem->wait_list); + atomic_long_set(&sem->owner, 0L); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + osq_lock_init(&sem->osq); +#endif +} +EXPORT_SYMBOL(__init_rwsem); + +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + +struct rwsem_waiter { + struct list_head list; + struct task_struct *task; + enum rwsem_waiter_type type; + unsigned long timeout; + unsigned long last_rowner; +}; +#define rwsem_first_waiter(sem) \ + list_first_entry(&sem->wait_list, struct rwsem_waiter, list) + +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; + +enum writer_wait_state { + WRITER_NOT_FIRST, /* Writer is not first in wait list */ + WRITER_FIRST, /* Writer is first in wait list */ + WRITER_HANDOFF /* Writer is first & handoff needed */ +}; + +/* + * The typical HZ value is either 250 or 1000. So set the minimum waiting + * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait + * queue before initiating the handoff protocol. + */ +#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) + +/* + * Magic number to batch-wakeup waiting readers, even when writers are + * also present in the queue. This both limits the amount of work the + * waking thread must do and also prevents any potential counter overflow, + * however unlikely. + */ +#define MAX_READERS_WAKEUP 0x100 + +/* + * handle the lock release when processes blocked on it that can now run + * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must + * have been set. + * - there must be someone on the queue + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released + * - woken process blocks are discarded from the list after having task zeroed + * - writers are only marked woken if downgrading is false + */ +static void rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) +{ + struct rwsem_waiter *waiter, *tmp; + long oldcount, woken = 0, adjustment = 0; + struct list_head wlist; + + lockdep_assert_held(&sem->wait_lock); + + /* + * Take a peek at the queue head waiter such that we can determine + * the wakeup(s) to perform. + */ + waiter = rwsem_first_waiter(sem); + + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. + */ + wake_q_add(wake_q, waiter->task); + lockevent_inc(rwsem_wake_writer); + } + + return; + } + + /* + * No reader wakeup if there are too many of them already. 
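As a quick check of the RWSEM_WAIT_TIMEOUT arithmetic above, this small stand-alone program (with a local copy of the kernel's DIV_ROUND_UP helper) prints the handoff wait for a few common HZ values; in every case it comes out as at least 4 ms, or one jiffy where a jiffy is longer than that:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int hz_vals[] = { 100, 250, 1000 };

	for (int i = 0; i < 3; i++) {
		int hz = hz_vals[i];
		int timeout = DIV_ROUND_UP(hz, 250);	/* RWSEM_WAIT_TIMEOUT in jiffies */

		/* one jiffy is 1000/HZ milliseconds */
		printf("HZ=%4d -> RWSEM_WAIT_TIMEOUT = %d jiffies (%d ms)\n",
		       hz, timeout, timeout * 1000 / hz);
	}
	return 0;
}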
+ */ + if (unlikely(atomic_long_read(&sem->count) < 0)) + return; + + /* + * Writers might steal the lock before we grant it to the next reader. + * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. + */ + if (wake_type != RWSEM_WAKE_READ_OWNED) { + struct task_struct *owner; + + adjustment = RWSEM_READER_BIAS; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount & RWSEM_WRITER_MASK)) { + /* + * When we've been waiting "too" long (for writers + * to give up the lock), request a HANDOFF to + * force the issue. + */ + if (!(oldcount & RWSEM_FLAG_HANDOFF) && + time_after(jiffies, waiter->timeout)) { + adjustment -= RWSEM_FLAG_HANDOFF; + lockevent_inc(rwsem_rlock_handoff); + } + + atomic_long_add(-adjustment, &sem->count); + return; + } + /* + * Set it to reader-owned to give spinners an early + * indication that readers now have the lock. + * The reader nonspinnable bit seen at slowpath entry of + * the reader is copied over. + */ + owner = waiter->task; + if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { + owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); + lockevent_inc(rwsem_opt_norspin); + } + __rwsem_set_reader_owned(sem, owner); + } + + /* + * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the + * queue. We know that the woken will be at least 1 as we accounted + * for above. Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. + * + * This is an adaptation of the phase-fair R/W locks where at the + * reader phase (first waiter is a reader), all readers are eligible + * to acquire the lock at the same time irrespective of their order + * in the queue. The writers acquire the lock according to their + * order in the queue. + * + * We have to do wakeup in 2 passes to prevent the possibility that + * the reader count may be decremented before it is incremented. It + * is because the to-be-woken waiter may not have slept yet. So it + * may see waiter->task got cleared, finish its critical section and + * do an unlock before the reader count increment. + * + * 1) Collect the read-waiters in a separate list, count them and + * fully increment the reader count in rwsem. + * 2) For each waiters in the new list, clear waiter->task and + * put them into wake_q to be woken up later. + */ + INIT_LIST_HEAD(&wlist); + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) + continue; + + woken++; + list_move_tail(&waiter->list, &wlist); + + /* + * Limit # of readers that can be woken up per wakeup call. + */ + if (woken >= MAX_READERS_WAKEUP) + break; + } + + adjustment = woken * RWSEM_READER_BIAS - adjustment; + lockevent_cond_inc(rwsem_wake_reader, woken); + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_FLAG_WAITERS; + } + + /* + * When we've woken a reader, we no longer need to force writers + * to give up the lock and we can clear HANDOFF. 
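One worked example of the count adjustment computed by rwsem_mark_wake() above, assuming three reader waiters are woken on the RWSEM_WAKE_ANY path and the wait list drains completely (editor's sketch, constants copied from this file):

#include <stdio.h>

#define RWSEM_FLAG_WAITERS	(1UL << 1)
#define RWSEM_READER_BIAS	(1UL << 8)

int main(void)
{
	/* RWSEM_WAKE_ANY path: one reader bias was speculatively added up front */
	long adjustment = RWSEM_READER_BIAS;
	long woken = 3;				/* three readers moved onto wlist */

	/* net change: grant three readers, minus the bias already added */
	adjustment = woken * RWSEM_READER_BIAS - adjustment;

	/* every waiter was a reader, so the wait list is now empty */
	adjustment -= RWSEM_FLAG_WAITERS;

	/* the single atomic update that gets applied to sem->count */
	printf("atomic_long_add(%ld, &sem->count)\n", adjustment);	/* 510 */
	return 0;
}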
+ */ + if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) + adjustment -= RWSEM_FLAG_HANDOFF; + + if (adjustment) + atomic_long_add(adjustment, &sem->count); + + /* 2nd pass */ + list_for_each_entry_safe(waiter, tmp, &wlist, list) { + struct task_struct *tsk; + + tsk = waiter->task; + get_task_struct(tsk); + + /* + * Ensure calling get_task_struct() before setting the reader + * waiter to nil such that rwsem_down_read_slowpath() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. + */ + smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add_safe(wake_q, tsk); + } +} + +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + * + * If wstate is WRITER_HANDOFF, it will make sure that either the handoff + * bit is set or the lock is acquired with handoff bit cleared. + */ +static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + enum writer_wait_state wstate) +{ + long count, new; + + lockdep_assert_held(&sem->wait_lock); + + count = atomic_long_read(&sem->count); + do { + bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); + + if (has_handoff && wstate == WRITER_NOT_FIRST) + return false; + + new = count; + + if (count & RWSEM_LOCK_MASK) { + if (has_handoff || (wstate != WRITER_HANDOFF)) + return false; + + new |= RWSEM_FLAG_HANDOFF; + } else { + new |= RWSEM_WRITER_LOCKED; + new &= ~RWSEM_FLAG_HANDOFF; + + if (list_is_singular(&sem->wait_list)) + new &= ~RWSEM_FLAG_WAITERS; + } + } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); + + /* + * We have either acquired the lock with handoff bit cleared or + * set the handoff bit. + */ + if (new & RWSEM_FLAG_HANDOFF) + return false; + + rwsem_set_owner(sem); + return true; +} + +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * Try to acquire read lock before the reader is put on wait queue. + * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff + * is ongoing. + */ +static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) + return false; + + count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); + if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_opt_rlock); + return true; + } + + /* Back out the change */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + return false; +} + +/* + * Try to acquire write lock before the writer has been put on wait queue. 
+ */ +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + + while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, + count | RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + lockevent_inc(rwsem_opt_wlock); + return true; + } + } + return false; +} + +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) +{ + struct task_struct *owner; + unsigned long flags; + bool ret = true; + + BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); + + if (need_resched()) { + lockevent_inc(rwsem_opt_fail); + return false; + } + + preempt_disable(); + rcu_read_lock(); + owner = rwsem_owner_flags(sem, &flags); + if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) + ret = false; + rcu_read_unlock(); + preempt_enable(); + + lockevent_cond_inc(rwsem_opt_fail, !ret); + return ret; +} + +/* + * The rwsem_spin_on_owner() function returns the folowing 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. + */ +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; +#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) + +static inline enum owner_state +rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) +{ + if (flags & nonspinnable) + return OWNER_NONSPINNABLE; + + if (flags & RWSEM_READER_OWNED) + return OWNER_READER; + + return owner ? OWNER_WRITER : OWNER_NULL; +} + +static noinline enum owner_state +rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +{ + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); + if (state != OWNER_WRITER) + return state; + + rcu_read_lock(); + for (;;) { + if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { + state = OWNER_NONSPINNABLE; + break; + } + + new = rwsem_owner_flags(sem, &new_flags); + if ((new != owner) || (new_flags != flags)) { + state = rwsem_owner_state(new, new_flags, nonspinnable); + break; + } + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (need_resched() || !owner_on_cpu(owner)) { + state = OWNER_NONSPINNABLE; + break; + } + + cpu_relax(); + } + rcu_read_unlock(); + + return state; +} + +/* + * Calculate reader-owned rwsem spinning threshold for writer + * + * The more readers own the rwsem, the longer it will take for them to + * wind down and free the rwsem. 
So the empirical formula used to + * determine the actual spinning time limit here is: + * + * Spinning threshold = (10 + nr_readers/2)us + * + * The limit is capped to a maximum of 25us (30 readers). This is just + * a heuristic and is subjected to change in the future. + */ +static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) +{ + long count = atomic_long_read(&sem->count); + int readers = count >> RWSEM_READER_SHIFT; + u64 delta; + + if (readers > 30) + readers = 30; + delta = (20 + readers) * NSEC_PER_USEC / 2; + + return sched_clock() + delta; +} + +static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +{ + bool taken = false; + int prev_owner_state = OWNER_NULL; + int loop = 0; + u64 rspin_threshold = 0; + unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE + : RWSEM_RD_NONSPINNABLE; + + preempt_disable(); + + /* sem->wait_lock should not be held when doing optimistic spinning */ + if (!osq_lock(&sem->osq)) + goto done; + + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock and spinning time has exceeded limit. + */ + for (;;) { + enum owner_state owner_state; + + owner_state = rwsem_spin_on_owner(sem, nonspinnable); + if (!(owner_state & OWNER_SPINNABLE)) + break; + + /* + * Try to acquire the lock + */ + taken = wlock ? rwsem_try_write_lock_unqueued(sem) + : rwsem_try_read_lock_unqueued(sem); + + if (taken) + break; + + /* + * Time-based reader-owned rwsem optimistic spinning + */ + if (wlock && (owner_state == OWNER_READER)) { + /* + * Re-initialize rspin_threshold every time when + * the owner state changes from non-reader to reader. + * This allows a writer to steal the lock in between + * 2 reader phases and have the threshold reset at + * the beginning of the 2nd reader phase. + */ + if (prev_owner_state != OWNER_READER) { + if (rwsem_test_oflags(sem, nonspinnable)) + break; + rspin_threshold = rwsem_rspin_threshold(sem); + loop = 0; + } + + /* + * Check time threshold once every 16 iterations to + * avoid calling sched_clock() too frequently so + * as to reduce the average latency between the times + * when the lock becomes free and when the spinner + * is ready to do a trylock. + */ + else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { + rwsem_set_nonspinnable(sem); + lockevent_inc(rwsem_opt_nospin); + break; + } + } + + /* + * An RT task cannot do optimistic spinning if it cannot + * be sure the lock holder is running or live-lock may + * happen if the current task and the lock holder happen + * to run in the same CPU. However, aborting optimistic + * spinning while a NULL owner is detected may miss some + * opportunity where spinning can continue without causing + * problem. + * + * There are 2 possible cases where an RT task may be able + * to continue spinning. + * + * 1) The lock owner is in the process of releasing the + * lock, sem->owner is cleared but the lock has not + * been released yet. + * 2) The lock was free and owner cleared, but another + * task just comes in and acquire the lock before + * we try to get it. The new owner may be a spinnable + * writer. + * + * To take advantage of two scenarios listed agove, the RT + * task is made to retry one more time to see if it can + * acquire the lock or continue spinning on the new owning + * writer. 
Of course, if the time lag is long enough or the + * new owner is not a writer or spinnable, the RT task will + * quit spinning. + * + * If the owner is a writer, the need_resched() check is + * done inside rwsem_spin_on_owner(). If the owner is not + * a writer, need_resched() check needs to be done here. + */ + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; + if (rt_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } + prev_owner_state = owner_state; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + osq_unlock(&sem->osq); +done: + preempt_enable(); + lockevent_cond_inc(rwsem_opt_fail, !taken); + return taken; +} + +/* + * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * only be called when the reader count reaches 0. + * + * This give writers better chance to acquire the rwsem first before + * readers when the rwsem was being held by readers for a relatively long + * period of time. Race can happen that an optimistic spinner may have + * just stolen the rwsem and set the owner, but just clearing the + * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. + */ +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) +{ + if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) + atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); +} + +/* + * This function is called when the reader fails to acquire the lock via + * optimistic spinning. In this case we will still attempt to do a trylock + * when comparing the rwsem state right now with the state when entering + * the slowpath indicates that the reader is still in a valid reader phase. + * This happens when the following conditions are true: + * + * 1) The lock is currently reader owned, and + * 2) The lock is previously not reader-owned or the last read owner changes. + * + * In the former case, we have transitioned from a writer phase to a + * reader-phase while spinning. In the latter case, it means the reader + * phase hasn't ended when we entered the optimistic spinning loop. In + * both cases, the reader is eligible to acquire the lock. This is the + * secondary path where a read lock is acquired optimistically. + * + * The reader non-spinnable bit wasn't set at time of entry or it will + * not be here at all. 
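For a feel of the spinning budget used by the loop above, this stand-alone re-computation of the rwsem_rspin_threshold() delta (editor's sketch mirroring the (10 + nr_readers/2) us formula and its 25 us / 30-reader cap) prints the budget for a few reader counts:

#include <stdio.h>

#define NSEC_PER_USEC	1000ULL

/* mirrors rwsem_rspin_threshold(): (10 + readers/2) us, capped at 25 us */
static unsigned long long rspin_delta_ns(int readers)
{
	if (readers > 30)
		readers = 30;
	return (20 + readers) * NSEC_PER_USEC / 2;
}

int main(void)
{
	int nr[] = { 1, 10, 30, 100 };

	for (int i = 0; i < 4; i++)
		printf("%3d reader(s) -> spin budget %llu ns\n",
		       nr[i], rspin_delta_ns(nr[i]));	/* 10500, 15000, 25000, 25000 */
	return 0;
}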
+ */ +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + unsigned long owner = atomic_long_read(&sem->owner); + + if (!(owner & RWSEM_READER_OWNED)) + return false; + + if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && + rwsem_try_read_lock_unqueued(sem)) { + lockevent_inc(rwsem_opt_rlock2); + lockevent_add(rwsem_opt_fail, -1); + return true; + } + return false; +} +#else +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, + unsigned long nonspinnable) +{ + return false; +} + +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +{ + return false; +} + +static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } + +static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, + unsigned long last_rowner) +{ + return false; +} +#endif + +/* + * Wait for the read lock to be granted + */ +static struct rw_semaphore __sched * +rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) +{ + long count, adjustment = -RWSEM_READER_BIAS; + struct rwsem_waiter waiter; + DEFINE_WAKE_Q(wake_q); + bool wake = false; + + /* + * Save the current read-owner of rwsem, if available, and the + * reader nonspinnable bit. + */ + waiter.last_rowner = atomic_long_read(&sem->owner); + if (!(waiter.last_rowner & RWSEM_READER_OWNED)) + waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; + + if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) + goto queue; + + /* + * Undo read bias from down_read() and do optimistic spinning. + */ + atomic_long_add(-RWSEM_READER_BIAS, &sem->count); + adjustment = 0; + if (rwsem_optimistic_spin(sem, false)) { + /* + * Wake up other readers in the wait list if the front + * waiter is a reader. + */ + if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + raw_spin_lock_irq(&sem->wait_lock); + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, + &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + } + return sem; + } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { + return sem; + } + +queue: + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_READ; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) { + /* + * In case the wait queue is empty and the lock isn't owned + * by a writer or has the handoff bit set, this reader can + * exit the slowpath and return immediately as its + * RWSEM_READER_BIAS has already been set in the count. + */ + if (adjustment && !(atomic_long_read(&sem->count) & + (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + raw_spin_unlock_irq(&sem->wait_lock); + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_fast); + return sem; + } + adjustment += RWSEM_FLAG_WAITERS; + } + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + if (adjustment) + count = atomic_long_add_return(adjustment, &sem->count); + else + count = atomic_long_read(&sem->count); + + /* + * If there are no active locks, wake the front queued process(es). + * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! 
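The reader-phase test performed by rwsem_reader_phase_trylock() above boils down to an XOR-and-mask on the owner word; a minimal sketch, with made-up owner addresses and flag values copied from this file, is:

#include <stdio.h>

#define RWSEM_READER_OWNED	(1UL << 0)
#define RWSEM_RD_NONSPINNABLE	(1UL << 1)
#define RWSEM_WR_NONSPINNABLE	(1UL << 2)
#define RWSEM_OWNER_FLAGS_MASK	(RWSEM_READER_OWNED | RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)

int main(void)
{
	/* owner word sampled at slowpath entry vs. after the failed spin */
	unsigned long last_rowner = 0x12340000UL | RWSEM_READER_OWNED;
	unsigned long owner	  = 0x56780000UL | RWSEM_READER_OWNED;

	/* the two conditions checked before the opportunistic read trylock:
	 * reader-owned now, and the task portion of the owner word changed */
	if ((owner & RWSEM_READER_OWNED) &&
	    ((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK))
		printf("reader phase detected -> worth one more trylock\n");
	return 0;
}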
+ */ + if (!(count & RWSEM_LOCK_MASK)) { + clear_wr_nonspinnable(sem); + wake = true; + } + if (wake || (!(count & RWSEM_WRITER_MASK) && + (adjustment & RWSEM_FLAG_WAITERS))) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + + /* wait to be given the lock */ + while (true) { + set_current_state(state); + if (!waiter.task) + break; + if (signal_pending_state(state, current)) { + raw_spin_lock_irq(&sem->wait_lock); + if (waiter.task) + goto out_nolock; + raw_spin_unlock_irq(&sem->wait_lock); + break; + } + schedule(); + lockevent_inc(rwsem_sleep_reader); + } + + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock); + return sem; +out_nolock: + list_del(&waiter.list); + if (list_empty(&sem->wait_list)) { + atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, + &sem->count); + } + raw_spin_unlock_irq(&sem->wait_lock); + __set_current_state(TASK_RUNNING); + lockevent_inc(rwsem_rlock_fail); + return ERR_PTR(-EINTR); +} + +/* + * This function is called by a write lock owner. So the owner value + * won't get changed by others. + */ +static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, + bool disable) +{ + if (unlikely(disable)) { + atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); + lockevent_inc(rwsem_opt_norspin); + } +} + +/* + * Wait until we successfully acquire the write lock + */ +static struct rw_semaphore * +rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) +{ + long count; + bool disable_rspin; + enum writer_wait_state wstate; + struct rwsem_waiter waiter; + struct rw_semaphore *ret = sem; + DEFINE_WAKE_Q(wake_q); + + /* do optimistic spinning and steal lock if possible */ + if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && + rwsem_optimistic_spin(sem, true)) + return sem; + + /* + * Disable reader optimistic spinning for this rwsem after + * acquiring the write lock when the setting of the nonspinnable + * bits is observed. + */ + disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; + + /* + * Optimistic spinning failed, proceed to the slowpath + * and block until we can acquire the sem. + */ + waiter.task = current; + waiter.type = RWSEM_WAITING_FOR_WRITE; + waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + + raw_spin_lock_irq(&sem->wait_lock); + + /* account for this before adding a new element to the list */ + wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; + + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock */ + if (wstate == WRITER_NOT_FIRST) { + count = atomic_long_read(&sem->count); + + /* + * If there were already threads queued before us and: + * 1) there are no active locks, wake the front + * queued process(es) as the handoff bit might be set. + * 2) there are no active writers but some readers, the lock + * must be read owned; so we try to wake any read lock + * waiters that were queued ahead of us. + */ + if (count & RWSEM_WRITER_MASK) + goto wait; + + rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) + ? RWSEM_WAKE_READERS + : RWSEM_WAKE_ANY, &wake_q); + + if (!wake_q_empty(&wake_q)) { + /* + * We want to minimize wait_lock hold time especially + * when a large number of readers are to be woken up.
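(Editorial aside, not part of the patch.) The drop-the-lock/wake/re-take sequence just below follows the generic wake_q idiom: wakeups are collected while the wait lock is held and issued only after it is released. A minimal sketch of that idiom, with a made-up lock and task:

#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

static void wake_outside_lock(raw_spinlock_t *lock, struct task_struct *t)
{
        DEFINE_WAKE_Q(wake_q);

        raw_spin_lock_irq(lock);
        /* Collect the tasks to wake while the lock is held... */
        wake_q_add(&wake_q, t);
        raw_spin_unlock_irq(lock);

        /* ...and perform the comparatively expensive wakeups after dropping it. */
        wake_up_q(&wake_q);
}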
+ */ + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + wake_q_init(&wake_q); /* Used again, reinit */ + raw_spin_lock_irq(&sem->wait_lock); + } + } else { + atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); + } + +wait: + /* wait until we successfully acquire the lock */ + set_current_state(state); + while (true) { + if (rwsem_try_write_lock(sem, wstate)) + break; + + raw_spin_unlock_irq(&sem->wait_lock); + + /* Block until there are no active lockers. */ + for (;;) { + if (signal_pending_state(state, current)) + goto out_nolock; + + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); + /* + * If HANDOFF bit is set, unconditionally do + * a trylock. + */ + if (wstate == WRITER_HANDOFF) + break; + + if ((wstate == WRITER_NOT_FIRST) && + (rwsem_first_waiter(sem) == &waiter)) + wstate = WRITER_FIRST; + + count = atomic_long_read(&sem->count); + if (!(count & RWSEM_LOCK_MASK)) + break; + + /* + * The setting of the handoff bit is deferred + * until rwsem_try_write_lock() is called. + */ + if ((wstate == WRITER_FIRST) && (rt_task(current) || + time_after(jiffies, waiter.timeout))) { + wstate = WRITER_HANDOFF; + lockevent_inc(rwsem_wlock_handoff); + break; + } + } + + raw_spin_lock_irq(&sem->wait_lock); + } + __set_current_state(TASK_RUNNING); + list_del(&waiter.list); + rwsem_disable_reader_optspin(sem, disable_rspin); + raw_spin_unlock_irq(&sem->wait_lock); + lockevent_inc(rwsem_wlock); + + return ret; + +out_nolock: + __set_current_state(TASK_RUNNING); + raw_spin_lock_irq(&sem->wait_lock); + list_del(&waiter.list); + + if (unlikely(wstate == WRITER_HANDOFF)) + atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); + + if (list_empty(&sem->wait_list)) + atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); + else + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); + lockevent_inc(rwsem_wlock_fail); + + return ERR_PTR(-EINTR); +} + +/* + * handle waking up a waiter on the semaphore + * - up_read/up_write has decremented the active part of count if we come here + */ +static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} + +/* + * downgrade a write lock into a read lock + * - caller incremented waiting part of count and discovered it still negative + * - just wake up any readers at the front of the queue + */ +static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + + raw_spin_lock_irqsave(&sem->wait_lock, flags); + + if (!list_empty(&sem->wait_list)) + rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + + raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); + + return sem; +} + +/* + * lock for reading + */ +inline void __down_read(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + 
rwsem_set_reader_owned(sem); + } + return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ + /* + * Optimize for the case when the rwsem is not locked at all. + */ + long tmp = RWSEM_UNLOCKED_VALUE; + + do { + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + tmp + RWSEM_READER_BIAS)) { + rwsem_set_reader_owned(sem); + return 1; + } + } while (!(tmp & RWSEM_READ_FAILED_MASK)); + return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) + rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); + else + rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED))) { + if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) + return -EINTR; + } else { + rwsem_set_owner(sem); + } + return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, + RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + return false; +} + +/* + * unlock after reading + */ +inline void __up_read(struct rw_semaphore *sem) +{ + long tmp; + + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + rwsem_clear_reader_owned(sem); + tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); + DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); + if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == + RWSEM_FLAG_WAITERS)) { + clear_wr_nonspinnable(sem); + rwsem_wake(sem, tmp); + } +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * sem->owner may differ from current if the ownership is transferred + * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. + */ + DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && + !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + rwsem_clear_owner(sem); + tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + if (unlikely(tmp & RWSEM_FLAG_WAITERS)) + rwsem_wake(sem, tmp); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ + long tmp; + + /* + * When downgrading from exclusive to shared ownership, + * anything inside the write-locked region cannot leak + * into the read side. In contrast, anything in the + * read-locked region is ok to be re-ordered into the + * write side. As such, rely on RELEASE semantics. 
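(Editorial aside, not from the patch.) For readers who have not used it, downgrade_write(), implemented just below, lets a writer keep reading without opening a window in which another writer could slip in. A usage sketch; my_rwsem and the counter are invented names:

static DECLARE_RWSEM(my_rwsem);
static int shared_counter;

static void update_then_read(void)
{
        down_write(&my_rwsem);          /* exclusive phase */
        shared_counter++;
        downgrade_write(&my_rwsem);     /* atomically become a reader */
        pr_info("counter is now %d\n", shared_counter);
        up_read(&my_rwsem);             /* a downgraded lock is released as a read lock */
}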
+ */ + DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); + tmp = atomic_long_fetch_add_release( + -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); + rwsem_set_reader_owned(sem); + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); +} /* * lock for reading @@ -24,9 +1445,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } - EXPORT_SYMBOL(down_read); int __sched down_read_killable(struct rw_semaphore *sem) @@ -39,10 +1458,8 @@ int __sched down_read_killable(struct rw_semaphore *sem) return -EINTR; } - rwsem_set_reader_owned(sem); return 0; } - EXPORT_SYMBOL(down_read_killable); /* @@ -52,13 +1469,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_reader_owned(sem); - } return ret; } - EXPORT_SYMBOL(down_read_trylock); /* @@ -68,11 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem) { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } - EXPORT_SYMBOL(down_write); /* @@ -83,15 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem) might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } - rwsem_set_owner(sem); return 0; } - EXPORT_SYMBOL(down_write_killable); /* @@ -101,14 +1511,11 @@ int down_write_trylock(struct rw_semaphore *sem) { int ret = __down_write_trylock(sem); - if (ret == 1) { + if (ret == 1) rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); - rwsem_set_owner(sem); - } return ret; } - EXPORT_SYMBOL(down_write_trylock); /* @@ -117,12 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock); void up_read(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); - - rwsem_clear_reader_owned(sem); __up_read(sem); } - EXPORT_SYMBOL(up_read); /* @@ -131,12 +1534,8 @@ EXPORT_SYMBOL(up_read); void up_write(struct rw_semaphore *sem) { rwsem_release(&sem->dep_map, 1, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); - - rwsem_clear_owner(sem); __up_write(sem); } - EXPORT_SYMBOL(up_write); /* @@ -145,12 +1544,8 @@ EXPORT_SYMBOL(up_write); void downgrade_write(struct rw_semaphore *sem) { lock_downgrade(&sem->dep_map, _RET_IP_); - DEBUG_RWSEMS_WARN_ON(sem->owner != current); - - rwsem_set_reader_owned(sem); __downgrade_write(sem); } - EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -159,43 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_read_trylock, __down_read); - rwsem_set_reader_owned(sem); } - EXPORT_SYMBOL(down_read_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } - EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); - __down_read(sem); __rwsem_set_reader_owned(sem, NULL); } - 
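(Editorial aside, illustrative code only.) The *_non_owner variants wrapped here exist for the case where the task that releases a read lock is not the task that acquired it, which is why the owner/lockdep bookkeeping is deliberately skipped. A sketch of such a hand-off, with all names invented:

#include <linux/rwsem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct lock_handoff {
        struct work_struct work;
        struct rw_semaphore *sem;
};

static void handoff_done(struct work_struct *w)
{
        struct lock_handoff *h = container_of(w, struct lock_handoff, work);

        up_read_non_owner(h->sem);      /* released by the worker... */
        kfree(h);
}

static int start_handoff(struct rw_semaphore *sem)
{
        struct lock_handoff *h = kmalloc(sizeof(*h), GFP_KERNEL);

        if (!h)
                return -ENOMEM;
        down_read_non_owner(sem);       /* ...but acquired by this task */
        h->sem = sem;
        INIT_WORK(&h->work, handoff_done);
        schedule_work(&h->work);
        return 0;
}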
EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED(sem, __down_write_trylock, __down_write); - rwsem_set_owner(sem); } - EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) @@ -203,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); - if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { + if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, + __down_write_killable)) { rwsem_release(&sem->dep_map, 1, _RET_IP_); return -EINTR; } - rwsem_set_owner(sem); return 0; } - EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { - DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); } - EXPORT_SYMBOL(up_read_non_owner); #endif diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..2534ce49f648 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,134 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * The least significant 2 bits of the owner value has the following - * meanings when set. - * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers - * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, - * i.e. the owner(s) cannot be readily determined. It can be reader - * owned or the owning writer is indeterminate. - * - * When a writer acquires a rwsem, it puts its task_struct pointer - * into the owner field. It is cleared after an unlock. - * - * When a reader acquires a rwsem, it will also puts its task_struct - * pointer into the owner field with both the RWSEM_READER_OWNED and - * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will - * largely be left untouched. So for a free or reader-owned rwsem, - * the owner value may contain information about the last reader that - * acquires the rwsem. The anonymous bit is set because that particular - * reader may or may not still own the lock. - * - * That information may be helpful in debugging cases where the system - * seems to hang on a reader owned rwsem especially if only one reader - * is involved. Ideally we would like to track all the readers that own - * a rwsem, but the overhead is simply too big. - */ -#define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) -#ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) -#else -# define DEBUG_RWSEMS_WARN_ON(c) -#endif +#ifndef __INTERNAL_RWSEM_H +#define __INTERNAL_RWSEM_H +#include <linux/rwsem.h> -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * All writes to owner are protected by WRITE_ONCE() to make sure that - * store tearing can't happen as optimistic spinners may read and use - * the owner value concurrently without lock. Read from owner, however, - * may not need READ_ONCE() as long as the pointer value is only used - * for comparison and isn't being dereferenced. 
- */ -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, current); -} +extern void __down_read(struct rw_semaphore *sem); +extern void __up_read(struct rw_semaphore *sem); -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - WRITE_ONCE(sem->owner, NULL); -} - -/* - * The task_struct pointer of the last owning reader will be left in - * the owner field. - * - * Note that the owner value just indicates the task has owned the rwsem - * previously, it may not be the real owner or one of the real owners - * anymore when that field is examined, so take it with a grain of salt. - */ -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ - unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - - WRITE_ONCE(sem->owner, (struct task_struct *)val); -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ - __rwsem_set_reader_owned(sem, current); -} - -/* - * Return true if the a rwsem waiter can spin on the rwsem's owner - * and steal the lock, i.e. the lock is not anonymously owned. - * N.B. !owner is considered spinnable. - */ -static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) -{ - return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); -} - -/* - * Return true if rwsem is owned by an anonymous writer or readers. - */ -static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) -{ - return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; -} - -#ifdef CONFIG_DEBUG_RWSEMS -/* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). - */ -#define rwsem_clear_reader_owned rwsem_clear_reader_owned -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ - unsigned long val = (unsigned long)current | RWSEM_READER_OWNED - | RWSEM_ANONYMOUSLY_OWNED; - if (READ_ONCE(sem->owner) == (struct task_struct *)val) - cmpxchg_relaxed((unsigned long *)&sem->owner, val, - RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); -} -#endif - -#else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} - -static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, - struct task_struct *owner) -{ -} - -static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) -{ -} -#endif - -#ifndef rwsem_clear_reader_owned -static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) -{ -} -#endif +#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 561acdd39960..d9dd94defc0a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008 Intel Corporation * Author: Matthew Wilcox <willy@linux.intel.com> * - * Distributed under the terms of the GNU GPL, version 2 - * * This file implements counting semaphores. * A counting semaphore may be acquired 'n' times before sleeping. 
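(Editorial aside, not part of the patch.) A short usage sketch of the counting semaphore described here, with invented names: initialized to N, the first N down() calls succeed immediately and the next caller sleeps until someone calls up().

#include <linux/semaphore.h>

static struct semaphore slots;

static void slots_setup(void)
{
        sema_init(&slots, 4);           /* allow four concurrent holders */
}

static int take_slot(void)
{
        if (down_interruptible(&slots)) /* sleeps once all four slots are taken */
                return -EINTR;
        /* ... use the shared resource ... */
        up(&slots);
        return 0;
}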
* See mutex.c for single-acquisition sleeping locks which enforce diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@ #include <linux/debug_locks.h> #include <linux/export.h> +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif + /* * If lockdep is enabled then we use the non-preemption spin-ops * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock) { debug_spin_lock_before(lock); arch_spin_lock(&lock->raw_lock); + mmiowb_spin_lock(); debug_spin_lock_after(lock); } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) { int ret = arch_spin_trylock(&lock->raw_lock); - if (ret) + if (ret) { + mmiowb_spin_lock(); debug_spin_lock_after(lock); + } #ifndef CONFIG_SMP /* * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock) void do_raw_spin_unlock(raw_spinlock_t *lock) { + mmiowb_spin_unlock(); debug_spin_unlock(lock); arch_spin_unlock(&lock->raw_lock); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 65a3b7e55b9f..3e82f449b4ff 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Module-based API test facility for ww_mutexes - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
*/ #include <linux/kernel.h> diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..bea6f887adad 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,42 +11,39 @@ #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/xarray.h> -#include <linux/hmm.h> static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); +static atomic_t devmap_managed_enable; + +static void devmap_managed_enable_put(void *data) { - struct page *page = device_private_entry_to_page(entry); - struct hmm_devmem *devmem; + if (atomic_dec_and_test(&devmap_managed_enable)) + static_branch_disable(&devmap_managed_key); +} - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + if (!pgmap->ops || !pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return -EINVAL; + } - /* - * The page_fault() callback must migrate page back to system memory - * so that CPU can access it. This might fail for various reasons - * (device issue, device was unsafely unplugged, ...). When such - * error conditions happen, the callback must return VM_FAULT_SIGBUS. - * - * Note that because memory cgroup charges are accounted to the device - * memory, this should never fail because of memory restrictions (but - * allocation of regular system page might still fail because we are - * out of memory). 
- * - * There is a more in-depth description of what that callback can and - * cannot do, in include/linux/memremap.h - */ - return devmem->page_fault(vma, addr, page, flags, pmdp); + if (atomic_inc_return(&devmap_managed_enable) == 1) + static_branch_enable(&devmap_managed_key); + return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); +} +#else +static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) +{ + return -EINVAL; } -EXPORT_SYMBOL(device_private_entry_fault); -#endif /* CONFIG_DEVICE_PRIVATE */ +#endif /* CONFIG_DEV_PAGEMAP_OPS */ static void pgmap_array_delete(struct resource *res) { @@ -57,14 +54,8 @@ static void pgmap_array_delete(struct resource *res) static unsigned long pfn_first(struct dev_pagemap *pgmap) { - const struct resource *res = &pgmap->res; - struct vmem_altmap *altmap = &pgmap->altmap; - unsigned long pfn; - - pfn = res->start >> PAGE_SHIFT; - if (pgmap->altmap_valid) - pfn += vmem_altmap_offset(altmap); - return pfn; + return (pgmap->res.start >> PAGE_SHIFT) + + vmem_altmap_offset(pgmap_altmap(pgmap)); } static unsigned long pfn_end(struct dev_pagemap *pgmap) @@ -84,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + +static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->cleanup) { + pgmap->ops->cleanup(pgmap); + } else { + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); + } +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -93,9 +102,10 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + dev_pagemap_kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); + dev_pagemap_cleanup(pgmap); /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); @@ -111,7 +121,7 @@ static void devm_memremap_pages_release(void *data) align_size >> PAGE_SHIFT, NULL); } else { arch_remove_memory(nid, align_start, align_size, - pgmap->altmap_valid ? &pgmap->altmap : NULL); + pgmap_altmap(pgmap)); kasan_remove_zero_shadow(__va(align_start), align_size); } mem_hotplug_done(); @@ -122,19 +132,28 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap = + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * 1/ At a minimum the res and type members of @pgmap must be initialized * by the caller before passing it to this function * - * 2/ The altmap field may optionally be initialized, in which case altmap_valid - * must be set to true + * 2/ The altmap field may optionally be initialized, in which case + * PGMAP_ALTMAP_VALID must be set in pgmap->flags. 
* - * 3/ pgmap->ref must be 'live' on entry and will be killed at + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed and reaped at * devm_memremap_pages_release() time, or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be @@ -144,15 +163,67 @@ static void devm_memremap_pages_release(void *data) void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; - struct vmem_altmap *altmap = pgmap->altmap_valid ? - &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = pgmap_altmap(pgmap), + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; + bool need_devmap_managed = true; + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) { + WARN(1, "Device private memory not supported\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->ops || !pgmap->ops->migrate_to_ram) { + WARN(1, "Missing migrate_to_ram method\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_FS_DAX: + if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || + IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { + WARN(1, "File system DAX not supported\n"); + return ERR_PTR(-EINVAL); + } + break; + case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_PCI_P2PDMA: + need_devmap_managed = false; + break; + default: + WARN(1, "Invalid pgmap type %d\n", pgmap->type); + break; + } - if (!pgmap->ref || !pgmap->kill) - return ERR_PTR(-EINVAL); + if (!pgmap->ref) { + if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + } else { + if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) { + WARN(1, "Missing reference count teardown definition\n"); + return ERR_PTR(-EINVAL); + } + } + + if (need_devmap_managed) { + error = devmap_managed_enable_get(dev, pgmap); + if (error) + return ERR_PTR(error); + } align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -163,14 +234,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); if (conflict_pgmap) { dev_WARN(dev, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); - return ERR_PTR(-ENOMEM); + error = -ENOMEM; + goto err_array; } is_ram = region_intersects(align_start, align_size, @@ -214,7 +287,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +295,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, 
align_size, + &restrictions); } if (!error) { @@ -231,7 +304,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, altmap); + align_size >> PAGE_SHIFT, pgmap_altmap(pgmap)); } mem_hotplug_done(); @@ -261,15 +334,24 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); + dev_pagemap_kill(pgmap); + dev_pagemap_cleanup(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); +void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) +{ + devm_release_action(dev, devm_memremap_pages_release, pgmap); +} +EXPORT_SYMBOL_GPL(devm_memunmap_pages); + unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ - return altmap->reserve + altmap->free; + if (altmap) + return altmap->reserve + altmap->free; + return 0; } void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) @@ -311,28 +393,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_enable; - -/* - * Toggle the static key for ->page_free() callbacks when dev_pagemap - * pages go idle. - */ -void dev_pagemap_get_ops(void) -{ - if (atomic_inc_return(&devmap_enable) == 1) - static_branch_enable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); - -void dev_pagemap_put_ops(void) -{ - if (atomic_dec_and_test(&devmap_enable)) - static_branch_disable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); - void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -348,7 +408,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page); } else if (!count) __put_page(page); } diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..33783abc377b 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Module internals * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/elf.h> @@ -20,7 +16,7 @@ struct load_info { unsigned long len; Elf_Shdr *sechdrs; char *secstrings, *strtab; - unsigned long symoffs, stroffs; + unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; struct _ddebug *debug; unsigned int num_debug; bool sig_ok; diff --git a/kernel/module.c b/kernel/module.c index 2ad1b5239910..a2cee14a83f3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Copyright (C) 2002 Richard Henderson Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 
- This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/export.h> #include <linux/extable.h> @@ -98,6 +86,10 @@ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); +/* Work queue for freeing init sections in success case */ +static struct work_struct init_free_wq; +static struct llist_head init_free_list; + #ifdef CONFIG_MODULES_TREE_LOOKUP /* @@ -286,6 +278,11 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} + /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); @@ -1949,9 +1946,16 @@ void module_enable_ro(const struct module *mod, bool after_init) if (!rodata_enabled) return; + set_vm_flush_reset_perms(mod->core_layout.base); + set_vm_flush_reset_perms(mod->init_layout.base); frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->core_layout, set_memory_x); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_ro); if (after_init) @@ -1967,15 +1971,6 @@ static void module_enable_nx(const struct module *mod) frob_writable_data(&mod->init_layout, set_memory_nx); } -static void module_disable_nx(const struct module *mod) -{ - frob_rodata(&mod->core_layout, set_memory_x); - frob_ro_after_init(&mod->core_layout, set_memory_x); - frob_writable_data(&mod->core_layout, set_memory_x); - frob_rodata(&mod->init_layout, set_memory_x); - frob_writable_data(&mod->init_layout, set_memory_x); -} - /* Iterate through all modules and set each module's text as RW */ void set_all_modules_text_rw(void) { @@ -2019,23 +2014,8 @@ void set_all_modules_text_ro(void) } mutex_unlock(&module_mutex); } - -static void disable_ro_nx(const struct module_layout *layout) -{ - if (rodata_enabled) { - frob_text(layout, set_memory_rw); - frob_rodata(layout, set_memory_rw); - frob_ro_after_init(layout, set_memory_rw); - } - frob_rodata(layout, set_memory_x); - frob_ro_after_init(layout, set_memory_x); - frob_writable_data(layout, set_memory_x); -} - #else -static void disable_ro_nx(const struct module_layout *layout) { } static void module_enable_nx(const struct module *mod) { } -static void module_disable_nx(const struct module *mod) { } #endif #ifdef CONFIG_LIVEPATCH @@ -2115,6 +2095,11 @@ static void free_module_elf(struct module *mod) void __weak module_memfree(void *module_region) { + /* + * This memory may be RO, and freeing RO memory in an interrupt is not + * supported by vmalloc. 
+ */ + WARN_ON(in_interrupt()); vfree(module_region); } @@ -2166,7 +2151,6 @@ static void free_module(struct module *mod) mutex_unlock(&module_mutex); /* This may be empty, but that's OK */ - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); module_memfree(mod->init_layout.base); kfree(mod->args); @@ -2176,7 +2160,6 @@ static void free_module(struct module *mod) lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - disable_ro_nx(&mod->core_layout); module_memfree(mod->core_layout.base); } @@ -2647,6 +2630,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_layout.size += strtab_size; + info->core_typeoffs = mod->core_layout.size; + mod->core_layout.size += ndst * sizeof(char); mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ @@ -2660,6 +2645,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) __alignof__(struct mod_kallsyms)); info->mod_kallsyms_init_off = mod->init_layout.size; mod->init_layout.size += sizeof(struct mod_kallsyms); + info->init_typeoffs = mod->init_layout.size; + mod->init_layout.size += nsrc * sizeof(char); mod->init_layout.size = debug_align(mod->init_layout.size); } @@ -2683,20 +2670,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; - /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->kallsyms->num_symtab; i++) - mod->kallsyms->symtab[i].st_size - = elf_type(&mod->kallsyms->symtab[i], info); - - /* Now populate the cut down core kallsyms for after init. */ + /* + * Now populate the cut down core kallsyms for after init + * and set types up while we still have access to sections. 
+ */ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { + mod->kallsyms->typetab[i] = elf_type(src + i, info); if (i == 0 || is_livepatch_module(mod) || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { + mod->core_kallsyms.typetab[ndst] = + mod->kallsyms->typetab[i]; dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -2719,11 +2709,7 @@ static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsig { if (!debug) return; -#ifdef CONFIG_DYNAMIC_DEBUG - if (ddebug_add_module(debug, num, mod->name)) - pr_err("dynamic debug error adding module: %s\n", - debug->modname); -#endif + ddebug_add_module(debug, num, mod->name); } static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) @@ -3097,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints_ptrs), &mod->num_tracepoints); #endif +#ifdef CONFIG_TREE_SRCU + mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", + sizeof(*mod->srcu_struct_ptrs), + &mod->num_srcu_structs); +#endif #ifdef CONFIG_BPF_EVENTS mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", sizeof(*mod->bpf_raw_events), @@ -3419,16 +3410,33 @@ static void do_mod_ctors(struct module *mod) /* For freeing module_init on success, in case kallsyms traversing */ struct mod_initfree { - struct rcu_head rcu; + struct llist_node node; void *module_init; }; -static void do_free_init(struct rcu_head *head) +static void do_free_init(struct work_struct *w) +{ + struct llist_node *pos, *n, *list; + struct mod_initfree *initfree; + + list = llist_del_all(&init_free_list); + + synchronize_rcu(); + + llist_for_each_safe(pos, n, list) { + initfree = container_of(pos, struct mod_initfree, node); + module_memfree(initfree->module_init); + kfree(initfree); + } +} + +static int __init modules_wq_init(void) { - struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); - module_memfree(m->module_init); - kfree(m); + INIT_WORK(&init_free_wq, do_free_init); + init_llist_head(&init_free_list); + return 0; } +module_init(modules_wq_init); /* * This is where the real work happens. @@ -3506,7 +3514,6 @@ static noinline int do_init_module(struct module *mod) #endif module_enable_ro(mod, true); mod_tree_remove_init(mod); - disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); mod->init_layout.base = NULL; mod->init_layout.size = 0; @@ -3517,14 +3524,18 @@ static noinline int do_init_module(struct module *mod) * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we * call synchronize_rcu(), but we don't want to slow down the success - * path, so use actual RCU here. + * path. module_memfree() cannot be called in an interrupt, so do the + * work and call synchronize_rcu() in a work queue. + * * Note that module_alloc() on most architectures creates W+X page * mappings which won't be cleaned up until do_free_init() runs. 
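(Editorial aside, not part of the patch.) The deferred-free change just below relies on a common llist plus workqueue idiom: llist_add() returns true only when the list was previously empty, so the worker is scheduled once per batch rather than once per queued item. A minimal sketch with invented names:

#include <linux/llist.h>
#include <linux/workqueue.h>

static LLIST_HEAD(pending);
static struct work_struct drain_work;   /* INIT_WORK(&drain_work, drain) during setup */

static void drain(struct work_struct *w)
{
        struct llist_node *pos, *n, *batch = llist_del_all(&pending);

        llist_for_each_safe(pos, n, batch) {
                /* process or free each queued node */
        }
}

static void queue_node(struct llist_node *node)
{
        if (llist_add(node, &pending))  /* true only if the list was empty */
                schedule_work(&drain_work);
}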
Any * code such as mark_rodata_ro() which depends on those mappings to * be cleaned up needs to sync with the queued work - ie * rcu_barrier() */ - call_rcu(&freeinit->rcu, do_free_init); + if (llist_add(&freeinit->node, &init_free_list)) + schedule_work(&init_free_wq); + mutex_unlock(&module_mutex); wake_up_all(&module_wq); @@ -3821,10 +3832,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* we can't deallocate the module until we clear memory protection */ - module_disable_ro(mod); - module_disable_nx(mod); - ddebug_cleanup: ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); @@ -4084,7 +4091,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, const Elf_Sym *sym = &kallsyms->symtab[symnum]; *value = kallsyms_symbol_value(sym); - *type = sym->st_size; + *type = kallsyms->typetab[symnum]; strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b9a926fd86b..b10fb1986ca9 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Module signature checker * * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. */ #include <linux/kernel.h> diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..d9f5081d578d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/export.h> @@ -22,6 +23,7 @@ static int notifier_chain_register(struct notifier_block **nl, struct notifier_block *n) { while ((*nl) != NULL) { + WARN_ONCE(((*nl) == n), "double register detected"); if (n->priority > (*nl)->priority) break; nl = &((*nl)->next); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f6c5d330059a..c815f58e6bc0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2006 IBM Corporation * * Author: Serge Hallyn <serue@us.ibm.com> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - * * Jun 2006 - namespaces support * OpenVZ, SWsoft Inc. 
* Pavel Emelianov <xemul@openvz.org> diff --git a/kernel/padata.c b/kernel/padata.c index 3e2633ae3bca..2d2fddbb7a4c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -957,6 +957,7 @@ static struct attribute *padata_default_attrs[] = { ¶llel_cpumask_attr.attr, NULL, }; +ATTRIBUTE_GROUPS(padata_default); static ssize_t padata_sysfs_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -995,7 +996,7 @@ static const struct sysfs_ops padata_sysfs_ops = { static struct kobj_type padata_attr_type = { .sysfs_ops = &padata_sysfs_ops, - .default_attrs = padata_default_attrs, + .default_groups = padata_default_groups, .release = padata_sysfs_release, }; diff --git a/kernel/panic.c b/kernel/panic.c index f121e6ba7e11..057540b6eee9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/panic.c * @@ -51,6 +52,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 +#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -134,6 +136,9 @@ EXPORT_SYMBOL(nmi_panic); static void panic_print_sys_info(void) { + if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) + console_flush_on_panic(CONSOLE_REPLAY_ALL); + if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -277,7 +282,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_flush_on_panic(); + console_flush_on_panic(CONSOLE_FLUSH_PENDING); panic_print_sys_info(); @@ -306,6 +311,8 @@ void panic(const char *fmt, ...) * shutting down. But if there is a chance of * rebooting the system it will be rebooted. */ + if (panic_reboot_mode != REBOOT_UNDEFINED) + reboot_mode = panic_reboot_mode; emergency_restart(); } #ifdef __sparc__ @@ -318,14 +325,12 @@ void panic(const char *fmt, ...) } #endif #if defined(CONFIG_S390) - { - unsigned long caller; - - caller = (unsigned long)__builtin_return_address(0); - disabled_wait(caller); - } + disabled_wait(); #endif pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); + + /* Do not scroll important messages printed above */ + suppress_printk = 1; local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -367,7 +372,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = { /** * print_tainted - return a string to represent the kernel taint state. * - * For individual taint flag meanings, see Documentation/sysctl/kernel.txt + * For individual taint flag meanings, see Documentation/admin-guide/sysctl/kernel.rst * * The string is overwritten by the next call to print_tainted(), * but is always NULL terminated. 
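(Editorial aside.) For context on the padata sysfs change above: ATTRIBUTE_GROUPS(name) expects an attribute array called name_attrs and generates name_group/name_groups, which is what allows the kobj_type to move from .default_attrs to .default_groups. A generic sketch follows; the foo_* names are made up, while kobj_sysfs_ops is the stock sysfs_ops exported by the kobject core:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t foo_value_show(struct kobject *kobj, struct kobj_attribute *attr,
                              char *buf)
{
        return sprintf(buf, "%d\n", 42);
}
static struct kobj_attribute foo_value_attr = __ATTR_RO(foo_value);

static struct attribute *foo_attrs[] = {
        &foo_value_attr.attr,
        NULL,
};
ATTRIBUTE_GROUPS(foo);                  /* defines foo_group and foo_groups */

static struct kobj_type foo_ktype = {
        .sysfs_ops      = &kobj_sysfs_ops,
        .default_groups = foo_groups,
};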
@@ -642,16 +647,14 @@ static int clear_warn_once_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops, - NULL, - clear_warn_once_set, - "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(clear_warn_once_fops, NULL, clear_warn_once_set, + "%lld\n"); static __init int register_warn_debugfs(void) { /* Don't care about failure */ - debugfs_create_file("clear_warn_once", 0200, NULL, - NULL, &clear_warn_once_fops); + debugfs_create_file_unsafe("clear_warn_once", 0200, NULL, NULL, + &clear_warn_once_fops); return 0; } diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..cf448785d058 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* Helpers for initial module or kernel cmdline parsing Copyright (C) 2001 Rusty Russell. - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/kernel.h> #include <linux/string.h> diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..0a9f2e437217 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * @@ -32,17 +33,18 @@ #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> -#include <linux/hash.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> -#include <linux/proc_fs.h> +#include <linux/refcount.h> +#include <linux/anon_inodes.h> +#include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> struct pid init_struct_pid = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, @@ -106,8 +108,7 @@ void put_pid(struct pid *pid) return; ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -210,10 +211,12 @@ struct pid *alloc_pid(struct pid_namespace *ns) } get_pid_ns(ns); - atomic_set(&pid->count, 1); + refcount_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); + init_waitqueue_head(&pid->wait_pidfd); + upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) @@ -451,6 +454,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) return idr_get_next(&ns->idr, &nr); } +/** + * pidfd_create() - Create a new pid file descriptor. + * + * @pid: struct pid that the pidfd will reference + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set. + * + * Note, that this function can only be called after the fd table has + * been unshared to avoid leaking the pidfd to the new process. + * + * Return: On success, a cloexec pidfd is returned. 
+ * On error, a negative errno number will be returned. + */ +static int pidfd_create(struct pid *pid) +{ + int fd; + + fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), + O_RDWR | O_CLOEXEC); + if (fd < 0) + put_pid(pid); + + return fd; +} + +/** + * pidfd_open() - Open new pid file descriptor. + * + * @pid: pid for which to retrieve a pidfd + * @flags: flags to pass + * + * This creates a new pid file descriptor with the O_CLOEXEC flag set for + * the process identified by @pid. Currently, the process identified by + * @pid must be a thread-group leader. This restriction currently exists + * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot + * be used with CLONE_THREAD) and pidfd polling (only supports thread group + * leaders). + * + * Return: On success, a cloexec pidfd is returned. + * On error, a negative errno number will be returned. + */ +SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) +{ + int fd, ret; + struct pid *p; + + if (flags) + return -EINVAL; + + if (pid <= 0) + return -EINVAL; + + p = find_get_pid(pid); + if (!p) + return -ESRCH; + + ret = 0; + rcu_read_lock(); + if (!pid_task(p, PIDTYPE_TGID)) + ret = -EINVAL; + rcu_read_unlock(); + + fd = ret ?: pidfd_create(p); + put_pid(p); + return fd; +} + void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..6d726cef241c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Pid namespaces * @@ -325,7 +326,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) } read_lock(&tasklist_lock); - force_sig(SIGKILL, pid_ns->child_reaper); + send_sig(SIGKILL, pid_ns->child_reaper, 1); read_unlock(&tasklist_lock); do_exit(0); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..d3667b4075c1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE @@ -65,7 +66,7 @@ config HIBERNATION need to run mkswap against the swap partition used for the suspend. It also works with swap files to a limited extent (for details see - <file:Documentation/power/swsusp-and-swap-files.txt>). + <file:Documentation/power/swsusp-and-swap-files.rst>). Right now you may boot without resuming and resume later but in the meantime you cannot use the swap partition(s)/file(s) involved in @@ -74,7 +75,7 @@ config HIBERNATION MOUNT any journaled filesystems mounted before the suspend or they will get corrupted in a nasty way. - For more information take a look at <file:Documentation/power/swsusp.txt>. + For more information take a look at <file:Documentation/power/swsusp.rst>. config ARCH_SAVE_PAGE_KEYS bool @@ -114,6 +115,15 @@ config PM_SLEEP_SMP depends on PM_SLEEP select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU + def_bool y + depends on PM_SLEEP_SMP + depends on ARCH_SUSPEND_NONZERO_CPU + ---help--- + If an arch can suspend (for suspend, hibernate, kexec, etc) on a + non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This + will allow nohz_full mask to include CPU0. + config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP @@ -246,7 +256,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. 
For location - and more information, read <file:Documentation/power/apm-acpi.txt> + and more information, read <file:Documentation/power/apm-acpi.rst> and the Battery Powered Linux mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>. diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d9dc2c38764a..0a9326f5f421 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -10,6 +10,7 @@ #include <linux/cpu.h> #include <linux/cpumask.h> +#include <linux/debugfs.h> #include <linux/energy_model.h> #include <linux/sched/topology.h> #include <linux/slab.h> @@ -23,6 +24,60 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data); */ static DEFINE_MUTEX(em_pd_mutex); +#ifdef CONFIG_DEBUG_FS +static struct dentry *rootdir; + +static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd) +{ + struct dentry *d; + char name[24]; + + snprintf(name, sizeof(name), "cs:%lu", cs->frequency); + + /* Create per-cs directory */ + d = debugfs_create_dir(name, pd); + debugfs_create_ulong("frequency", 0444, d, &cs->frequency); + debugfs_create_ulong("power", 0444, d, &cs->power); + debugfs_create_ulong("cost", 0444, d, &cs->cost); +} + +static int em_debug_cpus_show(struct seq_file *s, void *unused) +{ + seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private))); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); + +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) +{ + struct dentry *d; + char name[8]; + int i; + + snprintf(name, sizeof(name), "pd%d", cpu); + + /* Create the directory of the performance domain */ + d = debugfs_create_dir(name, rootdir); + + debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops); + + /* Create a sub-directory for each capacity state */ + for (i = 0; i < pd->nr_cap_states; i++) + em_debug_create_cs(&pd->table[i], d); +} + +static int __init em_debug_init(void) +{ + /* Create /sys/kernel/debug/energy_model directory */ + rootdir = debugfs_create_dir("energy_model", NULL); + + return 0; +} +core_initcall(em_debug_init); +#else /* CONFIG_DEBUG_FS */ +static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {} +#endif static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, struct em_data_callback *cb) { @@ -102,6 +157,8 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, pd->nr_cap_states = nr_states; cpumask_copy(to_cpumask(pd->cpus), span); + em_debug_create_pd(pd, cpu); + return pd; free_cs_table: @@ -166,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, * All CPUs of a domain must have the same micro-architecture * since they all share the same table. */ - cap = arch_scale_cpu_capacity(NULL, cpu); + cap = arch_scale_cpu_capacity(cpu); if (prev_cap && prev_cap != cap) { pr_err("CPUs of %*pbl must have the same capacity\n", cpumask_pr_args(span)); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..cd7434e6000d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. * @@ -6,15 +7,12 @@ * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> - * - * This file is released under the GPLv2. 
*/ #define pr_fmt(fmt) "PM: " fmt #include <linux/export.h> #include <linux/suspend.h> -#include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> @@ -130,7 +128,7 @@ static int hibernation_test(int level) { return 0; } static int platform_begin(int platform_mode) { return (platform_mode && hibernation_ops) ? - hibernation_ops->begin() : 0; + hibernation_ops->begin(PMSG_FREEZE) : 0; } /** @@ -258,6 +256,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, (kps % 1000) / 10); } +__weak int arch_resume_nosmt(void) +{ + return 0; +} + /** * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. @@ -281,7 +284,7 @@ static int create_image(int platform_mode) if (error || hibernation_test(TEST_PLATFORM)) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || hibernation_test(TEST_CPUS)) goto Enable_cpus; @@ -323,7 +326,11 @@ static int create_image(int platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); + + /* Allow architectures to do nosmt-specific post-resume dances */ + if (!in_suspend) + error = arch_resume_nosmt(); Platform_finish: platform_finish(platform_mode); @@ -417,7 +424,7 @@ int hibernation_snapshot(int platform_mode) int __weak hibernate_resume_nonboot_cpu_disable(void) { - return disable_nonboot_cpus(); + return suspend_disable_secondary_cpus(); } /** @@ -486,7 +493,7 @@ static int resume_target_kernel(bool platform_mode) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Cleanup: platform_restore_cleanup(platform_mode); @@ -543,7 +550,7 @@ int hibernation_platform_enter(void) * hibernation_ops->finish() before saving the image, so we should let * the firmware know that we're going to enter the sleep state after all */ - error = hibernation_ops->begin(); + error = hibernation_ops->begin(PMSG_HIBERNATE); if (error) goto Close; @@ -564,7 +571,7 @@ int hibernation_platform_enter(void) if (error) goto Platform_finish; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; @@ -586,7 +593,7 @@ int hibernation_platform_enter(void) local_irq_enable(); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_finish: hibernation_ops->finish(); @@ -709,9 +716,7 @@ int hibernate(void) goto Exit; } - pr_info("Syncing filesystems ... \n"); - ksys_sync(); - pr_info("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..bdbd605c4215 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. 
* * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * - * This file is released under the GPLv2 - * */ #include <linux/export.h> @@ -16,6 +14,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> +#include <linux/syscalls.h> #include "power.h" @@ -51,6 +50,19 @@ void unlock_system_sleep(void) } EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ + ktime_t start; + long elapsed_msecs; + + start = ktime_get(); + ksys_sync(); + elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); + pr_info("Filesystems sync: %ld.%03ld seconds\n", + elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/power.h b/kernel/power/power.h index 9e58bdc8a562..44bee462ff57 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {} static inline void hibernate_image_size_init(void) {} #endif /* !CONFIG_HIBERNATION */ -extern int pfn_is_nosave(unsigned long); - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7ef6866b521d..6d475281c730 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * poweroff.c - sysrq handler to gracefully power down machine. - * - * This file is released under the GPL v2 */ #include <linux/kernel.h> diff --git a/kernel/power/qos.c b/kernel/power/qos.c index b7a82502857a..33e3febaba53 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: @@ -582,10 +583,8 @@ static int register_pm_qos_misc(struct pm_qos_object *qos, struct dentry *d) qos->pm_qos_power_miscdev.name = qos->name; qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - if (d) { - (void)debugfs_create_file(qos->name, S_IRUGO, d, - (void *)qos, &pm_qos_debug_fops); - } + debugfs_create_file(qos->name, S_IRUGO, d, (void *)qos, + &pm_qos_debug_fops); return misc_register(&qos->pm_qos_power_miscdev); } @@ -685,8 +684,6 @@ static int __init pm_qos_power_init(void) BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); d = debugfs_create_dir("pm_qos", NULL); - if (IS_ERR_OR_NULL(d)) - d = NULL; for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) { ret = register_pm_qos_misc(pm_qos_array[i], d); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..83105874f255 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/snapshot.c * @@ -5,9 +6,6 @@ * * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. 
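The pm_qos debugfs simplification above leans on debugfs_create_dir()/debugfs_create_file() tolerating error-pointer parents, so callers no longer need the IS_ERR_OR_NULL() dance. A hypothetical driver-side sketch of the same calling convention:

#include <linux/debugfs.h>

static struct dentry *mydrv_dir;
static u32 mydrv_state;

static void mydrv_debugfs_init(void)
{
	mydrv_dir = debugfs_create_dir("mydrv", NULL);

	/*
	 * No IS_ERR_OR_NULL() check: if the directory could not be created,
	 * the error pointer is simply passed on and debugfs itself skips the
	 * child creation.
	 */
	debugfs_create_u32("state", 0444, mydrv_dir, &mydrv_state);
}

This is the behaviour the qos.c hunk now relies on when it drops the NULL/error checks around the "pm_qos" directory.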
- * */ #define pr_fmt(fmt) "PM: " fmt @@ -965,6 +963,9 @@ void __init __register_nosave_region(unsigned long start_pfn, /* This allocation cannot fail */ region = memblock_alloc(sizeof(struct nosave_region), SMP_CACHE_BYTES); + if (!region) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(struct nosave_region)); } region->start_pfn = start_pfn; region->end_pfn = end_pfn; @@ -1215,14 +1216,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(!PageHighMem(page)); - if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || - PageReserved(page)) + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) || PageOffline(page)) return NULL; if (page_is_guard(page)) @@ -1277,8 +1280,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (!pfn_valid(pfn)) return NULL; - page = pfn_to_page(pfn); - if (page_zone(page) != zone) + page = pfn_to_online_page(pfn); + if (!page || page_zone(page) != zone) return NULL; BUG_ON(PageHighMem(page)); @@ -1286,6 +1289,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; + if (PageOffline(page)) + return NULL; + if (PageReserved(page) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; @@ -1334,8 +1340,9 @@ static inline void do_copy_page(long *dst, long *src) * safe_copy_page - Copy a page in a safe way. * * Check if the page we are going to copy is marked as present in the kernel - * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set - * and in that case kernel_page_present() always returns 'true'). + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'. */ static void safe_copy_page(void *dst, struct page *s_page) { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..c874a7026e24 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend.c - Suspend to RAM and standby functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. - * - * This file is released under the GPLv2. */ #define pr_fmt(fmt) "PM: " fmt @@ -17,7 +16,6 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/cpuidle.h> -#include <linux/syscalls.h> #include <linux/gfp.h> #include <linux/io.h> #include <linux/kernel.h> @@ -63,11 +61,17 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); enum s2idle_states __read_mostly s2idle_state; static DEFINE_RAW_SPINLOCK(s2idle_lock); -bool pm_suspend_via_s2idle(void) +/** + * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend. + * + * Return 'true' if suspend-to-idle has been selected as the default system + * suspend method. 
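The snapshot.c hunks above switch saveable_page()/saveable_highmem_page() from pfn_to_page() to pfn_to_online_page() and additionally reject PageOffline() pages. A condensed sketch of that pattern, written as a hypothetical pfn walker:

#include <linux/memory_hotplug.h>
#include <linux/mm.h>

static bool mydrv_pfn_is_savable(unsigned long pfn)
{
	struct page *page;

	if (!pfn_valid(pfn))
		return false;

	page = pfn_to_online_page(pfn);	/* NULL for offline memory sections */
	if (!page)
		return false;

	if (PageOffline(page))		/* logically offline, e.g. ballooned */
		return false;

	return true;
}

Both checks matter: pfn_to_online_page() filters out whole offlined memory sections, while PageOffline() catches pages that are technically online but logically unbacked, which hibernation must not read or save.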
+ */ +bool pm_suspend_default_s2idle(void) { return mem_sleep_current == PM_SUSPEND_TO_IDLE; } -EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); +EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { @@ -428,7 +432,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_test(TEST_PLATFORM)) goto Platform_wake; - error = disable_nonboot_cpus(); + error = suspend_disable_secondary_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -458,7 +462,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) BUG_ON(irqs_disabled()); Enable_cpus: - enable_nonboot_cpus(); + suspend_enable_secondary_cpus(); Platform_wake: platform_resume_noirq(state); @@ -489,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state) pm_suspend_target_state = state; + if (state == PM_SUSPEND_TO_IDLE) + pm_set_suspend_no_platform(); + error = platform_suspend_begin(state); if (error) goto Close; @@ -568,13 +575,11 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_TO_IDLE) s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC - trace_suspend_resume(TPS("sync_filesystems"), 0, true); - pr_info("Syncing filesystems ... "); - ksys_sync(); - pr_cont("done.\n"); - trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif + if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { + trace_suspend_resume(TPS("sync_filesystems"), 0, true); + ksys_sync_helper(); + trace_suspend_resume(TPS("sync_filesystems"), 0, false); + } pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 6a897e8b2a88..60564b58de07 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -1,9 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. * * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> - * - * This file is released under the GPLv2. */ #include <linux/init.h> diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7f6c1a288d3..ca0fcb5ced71 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/swap.c * @@ -7,9 +8,6 @@ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> - * - * This file is released under the GPLv2. - * */ #define pr_fmt(fmt) "PM: " fmt @@ -976,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle, last = handle->maps = NULL; offset = swsusp_header->image; while (offset) { - tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); if (!tmp) { release_swap_reader(handle); return -ENOMEM; } - memset(tmp, 0, sizeof(*tmp)); if (!handle->maps) handle->maps = tmp; if (last) diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..77438954cc2b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -1,16 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * - * This file is released under the GPLv2. 
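pm_suspend_via_s2idle() is renamed to pm_suspend_default_s2idle() above with unchanged semantics: it answers whether suspend-to-idle is the default sleep state. A purely hypothetical driver-side sketch of a caller (the mydrv_* helpers are invented for illustration):

#include <linux/suspend.h>

static int mydrv_arm_native_wakeup(void)	{ return 0; }	/* hypothetical */
static int mydrv_arm_firmware_wakeup(void)	{ return 0; }	/* hypothetical */

static int mydrv_prepare_sleep(void)
{
	if (pm_suspend_default_s2idle())
		return mydrv_arm_native_wakeup();

	return mydrv_arm_firmware_wakeup();
}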
- * */ #include <linux/suspend.h> -#include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> @@ -228,9 +225,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (data->frozen) break; - printk("Syncing filesystems ... "); - ksys_sync(); - printk("done.\n"); + ksys_sync_helper(); error = freeze_processes(); if (error) diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4a2ffc39eb95..4d052fc6bcde 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 0f1898820cba..c8e6ab689d42 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * internal.h - printk internal definitions - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include <linux/percpu.h> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index d3d170374ceb..1888f6a3b694 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/printk.c * @@ -65,6 +66,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; +EXPORT_SYMBOL_GPL(console_printk); atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0); EXPORT_SYMBOL(ignore_console_lock_warning); @@ -85,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); +/* + * System may need to suppress printk message under certain + * circumstances, like after kernel panic happens. 
+ */ +int __read_mostly suppress_printk; + #ifdef CONFIG_LOCKDEP static struct lockdep_map console_lock_dep_map = { .name = "console_lock" @@ -344,7 +352,6 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; enum log_flags { LOG_NEWLINE = 2, /* text ended with a newline */ - LOG_PREFIX = 4, /* text started with a prefix */ LOG_CONT = 8, /* text is a fragment of a continuation line */ }; @@ -356,6 +363,9 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ +#ifdef CONFIG_PRINTK_CALLER + u32 caller_id; /* thread id or processor id */ +#endif } #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS __packed __aligned(4) @@ -422,7 +432,11 @@ static u64 exclusive_console_stop_seq; static u64 clear_seq; static u32 clear_idx; +#ifdef CONFIG_PRINTK_CALLER +#define PREFIX_MAX 48 +#else #define PREFIX_MAX 32 +#endif #define LOG_LINE_MAX (1024 - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) @@ -577,7 +591,7 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static int log_store(int facility, int level, +static int log_store(u32 caller_id, int facility, int level, enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) @@ -625,6 +639,9 @@ static int log_store(int facility, int level, msg->ts_nsec = ts_nsec; else msg->ts_nsec = local_clock(); +#ifdef CONFIG_PRINTK_CALLER + msg->caller_id = caller_id; +#endif memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = size; @@ -688,12 +705,21 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, struct printk_log *msg, u64 seq) { u64 ts_usec = msg->ts_nsec; + char caller[20]; +#ifdef CONFIG_PRINTK_CALLER + u32 id = msg->caller_id; + + snprintf(caller, sizeof(caller), ",caller=%c%u", + id & 0x80000000 ? 'C' : 'T', id & ~0x80000000); +#else + caller[0] = '\0'; +#endif do_div(ts_usec, 1000); - return scnprintf(buf, size, "%u,%llu,%llu,%c;", - (msg->facility << 3) | msg->level, seq, ts_usec, - msg->flags & LOG_CONT ? 'c' : '-'); + return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", + (msg->facility << 3) | msg->level, seq, ts_usec, + msg->flags & LOG_CONT ? 'c' : '-', caller); } static ssize_t msg_print_ext_body(char *buf, size_t size, @@ -1038,6 +1064,9 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_OFFSET(printk_log, len); VMCOREINFO_OFFSET(printk_log, text_len); VMCOREINFO_OFFSET(printk_log, dict_len); +#ifdef CONFIG_PRINTK_CALLER + VMCOREINFO_OFFSET(printk_log, caller_id); +#endif } #endif @@ -1122,14 +1151,7 @@ void __init setup_log_buf(int early) if (!new_log_buf_len) return; - if (early) { - new_log_buf = - memblock_alloc(new_log_buf_len, LOG_ALIGN); - } else { - new_log_buf = memblock_alloc_nopanic(new_log_buf_len, - LOG_ALIGN); - } - + new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN); if (unlikely(!new_log_buf)) { pr_err("log_buf_len: %lu bytes not available\n", new_log_buf_len); @@ -1236,10 +1258,23 @@ static size_t print_time(u64 ts, char *buf) { unsigned long rem_nsec = do_div(ts, 1000000000); - return sprintf(buf, "[%5lu.%06lu] ", + return sprintf(buf, "[%5lu.%06lu]", (unsigned long)ts, rem_nsec / 1000); } +#ifdef CONFIG_PRINTK_CALLER +static size_t print_caller(u32 id, char *buf) +{ + char caller[12]; + + snprintf(caller, sizeof(caller), "%c%u", + id & 0x80000000 ? 
'C' : 'T', id & ~0x80000000); + return sprintf(buf, "[%6s]", caller); +} +#else +#define print_caller(id, buf) 0 +#endif + static size_t print_prefix(const struct printk_log *msg, bool syslog, bool time, char *buf) { @@ -1247,8 +1282,17 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog, if (syslog) len = print_syslog((msg->facility << 3) | msg->level, buf); + if (time) len += print_time(msg->ts_nsec, buf + len); + + len += print_caller(msg->caller_id, buf + len); + + if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) { + buf[len++] = ' '; + buf[len] = '\0'; + } + return len; } @@ -1752,6 +1796,12 @@ static inline void printk_delay(void) } } +static inline u32 printk_caller_id(void) +{ + return in_task() ? task_pid_nr(current) : + 0x80000000 + raw_smp_processor_id(); +} + /* * Continuation lines are buffered, and not committed to the record buffer * until the line is complete, or a race forces it. The line fragments @@ -1761,7 +1811,7 @@ static inline void printk_delay(void) static struct cont { char buf[LOG_LINE_MAX]; size_t len; /* length == 0 means unused buffer */ - struct task_struct *owner; /* task of first print*/ + u32 caller_id; /* printk_caller_id() of first print */ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log facility of first message */ @@ -1773,12 +1823,13 @@ static void cont_flush(void) if (cont.len == 0) return; - log_store(cont.facility, cont.level, cont.flags, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); + log_store(cont.caller_id, cont.facility, cont.level, cont.flags, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); cont.len = 0; } -static bool cont_add(int facility, int level, enum log_flags flags, const char *text, size_t len) +static bool cont_add(u32 caller_id, int facility, int level, + enum log_flags flags, const char *text, size_t len) { /* If the line gets too long, split it up in separate records. */ if (cont.len + len > sizeof(cont.buf)) { @@ -1789,7 +1840,7 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * if (!cont.len) { cont.facility = facility; cont.level = level; - cont.owner = current; + cont.caller_id = caller_id; cont.ts_nsec = local_clock(); cont.flags = flags; } @@ -1809,13 +1860,15 @@ static bool cont_add(int facility, int level, enum log_flags flags, const char * static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) { + const u32 caller_id = printk_caller_id(); + /* * If an earlier line was buffered, and we're a continuation - * write from the same process, try to add it to the buffer. + * write from the same context, try to add it to the buffer. 
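With CONFIG_PRINTK_CALLER, the caller id computed by printk_caller_id() above is exported through the extended record header as a ",caller=T<pid>" or ",caller=C<cpu>" field. A small userspace sketch that pulls it back out of a /dev/kmsg record (reading /dev/kmsg may require appropriate privileges):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[8192];
	char *caller, *semi;
	ssize_t n;
	int fd;

	fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);
	if (fd < 0)
		return 1;

	n = read(fd, buf, sizeof(buf) - 1);	/* one record per read() */
	if (n <= 0) {
		close(fd);
		return 1;
	}
	buf[n] = '\0';

	semi = strchr(buf, ';');		/* header ends at the first ';' */
	caller = strstr(buf, "caller=");
	if (caller && semi && caller < semi)
		printf("record logged from %.*s\n", (int)(semi - caller), caller);
	else
		printf("no caller field (CONFIG_PRINTK_CALLER off?)\n");

	close(fd);
	return 0;
}

On the console and in dmesg the same id appears as the additional bracketed column emitted by print_caller().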
*/ if (cont.len) { - if (cont.owner == current && (lflags & LOG_CONT)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Otherwise, make sure it's flushed */ @@ -1828,12 +1881,13 @@ static size_t log_output(int facility, int level, enum log_flags lflags, const c /* If it doesn't end in a newline, try to buffer the current line */ if (!(lflags & LOG_NEWLINE)) { - if (cont_add(facility, level, lflags, text, text_len)) + if (cont_add(caller_id, facility, level, lflags, text, text_len)) return text_len; } /* Store it in the record log */ - return log_store(facility, level, lflags, 0, dict, dictlen, text, text_len); + return log_store(caller_id, facility, level, lflags, 0, + dict, dictlen, text, text_len); } /* Must be called under logbuf_lock. */ @@ -1867,9 +1921,6 @@ int vprintk_store(int facility, int level, case '0' ... '7': if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; - /* fallthrough */ - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; break; case 'c': /* KERN_CONT */ lflags |= LOG_CONT; @@ -1884,7 +1935,7 @@ int vprintk_store(int facility, int level, level = default_message_loglevel; if (dict) - lflags |= LOG_PREFIX|LOG_NEWLINE; + lflags |= LOG_NEWLINE; return log_output(facility, level, lflags, dict, dictlen, text, text_len); @@ -1899,6 +1950,10 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; u64 curr_log_seq; + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + if (level == LOGLEVEL_SCHED) { level = LOGLEVEL_DEFAULT; in_sched = true; @@ -2481,10 +2536,11 @@ void console_unblank(void) /** * console_flush_on_panic - flush console content on panic + * @mode: flush all messages in buffer or just the pending ones * * Immediately output all pending messages no matter what. */ -void console_flush_on_panic(void) +void console_flush_on_panic(enum con_flush_mode mode) { /* * If someone else is holding the console lock, trylock will fail @@ -2495,6 +2551,15 @@ void console_flush_on_panic(void) */ console_trylock(); console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { + unsigned long flags; + + logbuf_lock_irqsave(flags); + console_seq = log_first_seq; + console_idx = log_first_idx; + logbuf_unlock_irqrestore(flags); + } console_unlock(); } diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 0913b4d385de..b4045e782743 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,18 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see <http://www.gnu.org/licenses/>. 
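Before leaving printk.c: the two panic-oriented additions above, suppress_printk and the con_flush_mode argument to console_flush_on_panic(), are meant to be driven from the panic path. A hedged, hypothetical sketch of such a caller follows; CONSOLE_FLUSH_PENDING as the name of the "pending messages only" mode is an assumption about enum con_flush_mode, which is not part of this hunk, and this is not the actual kernel/panic.c code.

#include <linux/console.h>
#include <linux/printk.h>

extern int suppress_printk;	/* declared wherever the series exports it */

static void mydrv_panic_epilogue(bool replay_all)
{
	/* Drop follow-on noise from other CPUs once the report is out. */
	suppress_printk = 1;

	/* CONSOLE_FLUSH_PENDING is assumed as the "pending only" mode. */
	console_flush_on_panic(replay_all ? CONSOLE_REPLAY_ALL
					  : CONSOLE_FLUSH_PENDING);
}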
*/ #include <linux/preempt.h> diff --git a/kernel/profile.c b/kernel/profile.c index 9c08a2c7cb1d..af7c94bf5fa1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/profile.c * Simple profiling. Manages a direct-mapped profile hit count buffer, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 771e93f9c43f..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/ptrace.c * @@ -29,6 +30,9 @@ #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> #include <linux/compat.h> +#include <linux/sched/signal.h> + +#include <asm/syscall.h> /* for syscall_get_* */ /* * Access another process' address space via ptrace. @@ -77,9 +81,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, */ static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - rcu_read_lock(); - __ptrace_link(child, new_parent, __task_cred(new_parent)); - rcu_read_unlock(); + __ptrace_link(child, new_parent, current_cred()); } /** @@ -116,6 +118,9 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif child->parent = child->real_parent; list_del_init(&child->ptrace_entry); @@ -322,6 +327,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) return -EPERM; ok: rcu_read_unlock(); + /* + * If a task drops privileges and becomes nondumpable (through a syscall + * like setresuid()) while we are trying to access it, we must ensure + * that the dumpability is read after the credentials; otherwise, + * we may be able to attach to a task that we shouldn't be able to + * attach to (as if the task had dropped privileges without becoming + * nondumpable). + * Pairs with a write barrier in commit_creds(). + */ + smp_rmb(); mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && @@ -703,6 +718,10 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (arg.nr < 0) return -EINVAL; + /* Ensure arg.off fits in an unsigned long */ + if (arg.off > ULONG_MAX) + return 0; + if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) pending = &child->signal->shared_pending; else @@ -710,18 +729,20 @@ static int ptrace_peek_siginfo(struct task_struct *child, for (i = 0; i < arg.nr; ) { kernel_siginfo_t info; - s32 off = arg.off + i; + unsigned long off = arg.off + i; + bool found = false; spin_lock_irq(&child->sighand->siglock); list_for_each_entry(q, &pending->list, list) { if (!off--) { + found = true; copy_siginfo(&info, &q->info); break; } } spin_unlock_irq(&child->sighand->siglock); - if (off >= 0) /* beyond the end of the list */ + if (!found) /* beyond the end of the list */ break; #ifdef CONFIG_COMPAT @@ -878,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. 
*/ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. + */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? 
-EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -924,18 +1038,26 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; - case PTRACE_GETSIGMASK: + case PTRACE_GETSIGMASK: { + sigset_t *mask; + if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } - if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + if (test_tsk_restore_sigmask(child)) + mask = &child->saved_sigmask; + else + mask = &child->blocked; + + if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; + } case PTRACE_SETSIGMASK: { sigset_t new_set; @@ -961,6 +1083,8 @@ int ptrace_request(struct task_struct *child, long request, child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); + clear_tsk_restore_sigmask(child); + ret = 0; break; } @@ -1085,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 939a2056c87a..480edf328b51 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related configuration options # @@ -87,36 +88,6 @@ config RCU_STALL_COMMON config RCU_NEED_SEGCBLIST def_bool ( TREE_RCU || PREEMPT_RCU || TREE_SRCU ) -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. - - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - - config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..5ec3ea4028e2 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # RCU-related debugging configuration options # diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index a393e24a9195..5290b01de534 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -1,36 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update definitions shared among RCU implementations. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
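A tracer-side sketch of the new PTRACE_GET_SYSCALL_INFO request added above. It assumes a uapi <linux/ptrace.h> that already carries struct ptrace_syscall_info and the PTRACE_SYSCALL_INFO_* constants, plus the 0x420e request number proposed with this series; depending on the libc, the header mix may need adjusting. PTRACE_O_TRACESYSGOOD is set so syscall stops report si_code SIGTRAP | 0x80, which is what ptrace_get_syscall_info() keys on.

#include <sys/ptrace.h>		/* keep before linux/ptrace.h */
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/ptrace.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

#ifndef PTRACE_GET_SYSCALL_INFO
#define PTRACE_GET_SYSCALL_INFO 0x420e	/* assumption: value used by the series */
#endif

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);
		getpid();	/* one syscall for the tracer to inspect */
		_exit(0);
	}

	waitpid(pid, NULL, 0);
	/* Needed so syscall stops report si_code == SIGTRAP | 0x80. */
	ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_TRACESYSGOOD);

	for (;;) {
		struct ptrace_syscall_info info;
		int status;

		if (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
			break;
		waitpid(pid, &status, 0);
		if (WIFEXITED(status))
			break;

		/* addr carries the size of the buffer we can accept. */
		if (ptrace(PTRACE_GET_SYSCALL_INFO, pid,
			   (void *)sizeof(info), &info) < 0)
			continue;

		if (info.op == PTRACE_SYSCALL_INFO_ENTRY)
			printf("entry: nr=%llu\n", (unsigned long long)info.entry.nr);
		else if (info.op == PTRACE_SYSCALL_INFO_EXIT)
			printf("exit:  rval=%lld\n", (long long)info.exit.rval);
	}

	return 0;
}

The addr argument carries the size of the caller's buffer, and on success the return value is the size the kernel wanted to write, so a tracer can detect that the structure has grown beyond what it knows about.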
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2011 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> */ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H #include <trace/events/rcu.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) @@ -229,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) + trace_rcu_invoke_kfree_callback(rn, head, offset); kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head);) + trace_rcu_invoke_callback(rn, head); f = head->func; WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); @@ -246,6 +228,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout; int rcu_jiffies_till_stall_check(void); #define rcu_ftrace_dump_stall_suppress() \ @@ -462,9 +445,8 @@ void rcu_request_urgent_qs_task(struct task_struct *t); enum rcutorture_type { RCU_FLAVOR, - RCU_BH_FLAVOR, - RCU_SCHED_FLAVOR, RCU_TASKS_FLAVOR, + RCU_TRIVIAL_FLAVOR, SRCU_FLAVOR, INVALID_RCU_FLAVOR }; @@ -498,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #endif #endif +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); +#endif + #ifdef CONFIG_TINY_SRCU static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 5aff271adf1e..9bd5f6023c21 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU segmented callback lists, function definitions * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. 
McKenney <paulmck@linux.ibm.com> */ #include <linux/types.h> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 948470cef385..71b64648464e 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -1,23 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU segmented callback lists, internal-to-rcu header file * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2017 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/rcu_segcblist.h> diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index b459da70b4fc..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based performance-test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2015 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #define pr_fmt(fmt) fmt @@ -54,7 +41,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); #define PERF_FLAG "-perf:" #define PERFOUT_STRING(s) \ @@ -83,13 +70,19 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); * Various other use cases may of course be specified. 
*/ +#ifdef MODULE +# define RCUPERF_SHUTDOWN 0 +#else +# define RCUPERF_SHUTDOWN 1 +#endif + torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, !IS_ENABLED(MODULE), +torture_param(bool, shutdown, RCUPERF_SHUTDOWN, "Shutdown at end of performance tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); @@ -501,6 +494,10 @@ rcu_perf_cleanup(void) if (torture_cleanup_begin()) return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) @@ -621,6 +618,7 @@ rcu_perf_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->init) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f6e85faa4ff4..fce4e7e6f502 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update module-based torture test facility * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2005, 2006 * - * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> * Josh Triplett <josh@joshtriplett.org> * * See also: Documentation/RCU/torture.txt @@ -61,7 +48,7 @@ #include "rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); /* Bits for ->extendables field, extendables param, and related definitions. */ @@ -312,7 +299,7 @@ struct rcu_torture_ops { int irq_capable; int can_boost; int extendables; - int ext_irq_conflict; + int slow_gps; const char *name; }; @@ -605,12 +592,7 @@ static void srcu_torture_init(void) static void srcu_torture_cleanup(void) { - static DEFINE_TORTURE_RANDOM(rand); - - if (torture_random(&rand) & 0x800) - cleanup_srcu_struct(&srcu_ctld); - else - cleanup_srcu_struct_quiesced(&srcu_ctld); + cleanup_srcu_struct(&srcu_ctld); srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */ } @@ -686,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, + .slow_gps = 1, .name = "tasks" }; +/* + * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. + * This implementation does not necessarily work well with CPU hotplug. 
+ */ + +static void synchronize_rcu_trivial(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); + WARN_ON_ONCE(raw_smp_processor_id() != cpu); + } +} + +static int rcu_torture_read_lock_trivial(void) __acquires(RCU) +{ + preempt_disable(); + return 0; +} + +static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) +{ + preempt_enable(); +} + +static struct rcu_torture_ops trivial_ops = { + .ttype = RCU_TRIVIAL_FLAVOR, + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock_trivial, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock_trivial, + .get_gp_seq = rcu_no_completed, + .sync = synchronize_rcu_trivial, + .exp_sync = synchronize_rcu_trivial, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "trivial" +}; + static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -1029,10 +1053,17 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer")) + if (stutter_wait("rcu_torture_writer") && + !READ_ONCE(rcu_fwd_cb_nodelay) && + !cur_ops->slow_gps && + !torture_must_stop()) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) - if (list_empty(&rcu_tortures[i].rtort_free)) - WARN_ON_ONCE(1); + if (list_empty(&rcu_tortures[i].rtort_free) && + rcu_access_pointer(rcu_torture_current) != + &rcu_tortures[i]) { + rcu_ftrace_dump(DUMP_ALL); + WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); + } } while (!torture_must_stop()); /* Reset expediting back to unexpedited. */ if (expediting > 0) @@ -1173,7 +1204,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long randmask2 = randmask1 >> 3; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); - /* Most of the time lots of bits, half the time only one bit. */ + /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else @@ -1183,10 +1214,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - if ((mask & RCUTORTURE_RDR_IRQ) && - !(mask & cur_ops->ext_irq_conflict) && - (oldmask & cur_ops->ext_irq_conflict)) - mask |= cur_ops->ext_irq_conflict; /* Or if readers object. */ return mask ?: RCUTORTURE_RDR_RCU; } @@ -1381,8 +1408,9 @@ rcu_torture_stats_print(void) } pr_alert("%s%s ", torture_type, TORTURE_FLAG); - pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", rcu_torture_current, + rcu_torture_current ? "ver" : "VER", rcu_torture_current_version, list_empty(&rcu_torture_freelist), atomic_read(&n_rcu_torture_alloc), @@ -1630,21 +1658,34 @@ static bool rcu_fwd_emergency_stop; #define MIN_FWD_CB_LAUNDERS 3 /* This many CB invocations to count. */ #define MIN_FWD_CBS_LAUNDERED 100 /* Number of counted CBs. */ #define FWD_CBS_HIST_DIV 10 /* Histogram buckets/second. 
*/ -static long n_launders_hist[2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)]; +struct rcu_launder_hist { + long n_launders; + unsigned long launder_gp_seq; +}; +#define N_LAUNDERS_HIST (2 * MAX_FWD_CB_JIFFIES / (HZ / FWD_CBS_HIST_DIV)) +static struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; +static unsigned long rcu_launder_gp_seq_start; static void rcu_torture_fwd_cb_hist(void) { + unsigned long gps; + unsigned long gps_old; int i; int j; for (i = ARRAY_SIZE(n_launders_hist) - 1; i > 0; i--) - if (n_launders_hist[i] > 0) + if (n_launders_hist[i].n_launders > 0) break; pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", __func__, jiffies - rcu_fwd_startat); - for (j = 0; j <= i; j++) - pr_cont(" %ds/%d: %ld", - j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j]); + gps_old = rcu_launder_gp_seq_start; + for (j = 0; j <= i; j++) { + gps = n_launders_hist[j].launder_gp_seq; + pr_cont(" %ds/%d: %ld:%ld", + j + 1, FWD_CBS_HIST_DIV, n_launders_hist[j].n_launders, + rcutorture_seq_diff(gps, gps_old)); + gps_old = gps; + } pr_cont("\n"); } @@ -1666,10 +1707,22 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) i = ((jiffies - rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV)); if (i >= ARRAY_SIZE(n_launders_hist)) i = ARRAY_SIZE(n_launders_hist) - 1; - n_launders_hist[i]++; + n_launders_hist[i].n_launders++; + n_launders_hist[i].launder_gp_seq = cur_ops->get_gp_seq(); spin_unlock_irqrestore(&rcu_fwd_lock, flags); } +// Give the scheduler a chance, even on nohz_full CPUs. +static void rcu_torture_fwd_prog_cond_resched(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (need_resched()) + schedule(); + } else { + cond_resched(); + } +} + /* * Free all callbacks on the rcu_fwd_cb_head list, either because the * test is over or because we hit an OOM event. @@ -1683,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) for (;;) { spin_lock_irqsave(&rcu_fwd_lock, flags); rfcp = rcu_fwd_cb_head; - if (!rfcp) + if (!rfcp) { + spin_unlock_irqrestore(&rcu_fwd_lock, flags); break; + } rcu_fwd_cb_head = rfcp->rfc_next; if (!rcu_fwd_cb_head) rcu_fwd_cb_tail = &rcu_fwd_cb_head; spin_unlock_irqrestore(&rcu_fwd_lock, flags); kfree(rfcp); freed++; + rcu_torture_fwd_prog_cond_resched(); } - spin_unlock_irqrestore(&rcu_fwd_lock, flags); return freed; } @@ -1716,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) } /* Tight loop containing cond_resched(). */ + WRITE_ONCE(rcu_fwd_cb_nodelay, true); + cur_ops->sync(); /* Later readers see above write. */ if (selfpropcb) { WRITE_ONCE(fcs.stop, 0); cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); @@ -1733,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) udelay(10); cur_ops->readunlock(idx); if (!fwd_progress_need_resched || need_resched()) - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } (*tested_tries)++; if (!time_before(jiffies, stopat) && @@ -1754,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) WARN_ON(READ_ONCE(fcs.stop) != 2); destroy_rcu_head_on_stack(&fcs.rh); } + schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } /* Carry out call_rcu() forward-progress testing. */ @@ -1774,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void) if (READ_ONCE(rcu_fwd_emergency_stop)) return; /* Get out of the way quickly, no GP wait! */ + if (!cur_ops->call) + return; /* Can't do call_rcu() fwd prog without ->call. 
*/ /* Loop continuously posting RCU callbacks. */ WRITE_ONCE(rcu_fwd_cb_nodelay, true); @@ -1786,9 +1847,10 @@ static void rcu_torture_fwd_prog_cr(void) n_max_cbs = 0; n_max_gps = 0; for (i = 0; i < ARRAY_SIZE(n_launders_hist); i++) - n_launders_hist[i] = 0; + n_launders_hist[i].n_launders = 0; cver = READ_ONCE(rcu_torture_current_version); gps = cur_ops->get_gp_seq(); + rcu_launder_gp_seq_start = gps; while (time_before(jiffies, stopat) && !READ_ONCE(rcu_fwd_emergency_stop) && !torture_must_stop()) { rfcp = READ_ONCE(rcu_fwd_cb_head); @@ -1813,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void) rfcp->rfc_gps = 0; } cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); - cond_resched(); + rcu_torture_fwd_prog_cond_resched(); } stoppedat = jiffies; n_launders_cb_snap = READ_ONCE(n_launders_cb); @@ -1822,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void) cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ (void)rcu_torture_fwd_prog_cbfree(); - WRITE_ONCE(rcu_fwd_cb_nodelay, false); if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", @@ -1833,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void) n_max_gps, n_max_cbs, cver, gps); rcu_torture_fwd_cb_hist(); } + schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ + WRITE_ONCE(rcu_fwd_cb_nodelay, false); } @@ -1846,7 +1909,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2); WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ pr_info("%s: Freed %lu RCU callbacks.\n", @@ -2092,6 +2155,10 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } + if (!cur_ops) { + torture_cleanup_end(); + return; + } rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); @@ -2228,6 +2295,14 @@ static void rcu_test_debug_objects(void) #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ } +static void rcutorture_sync(void) +{ + static unsigned long n; + + if (cur_ops->sync && !(++n & 0xfff)) + cur_ops->sync(); +} + static int __init rcu_torture_init(void) { @@ -2236,7 +2311,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &busted_srcud_ops, &tasks_ops, + &busted_srcud_ops, &tasks_ops, &trivial_ops, }; if (!torture_init_begin(torture_type, verbose)) @@ -2257,6 +2332,7 @@ rcu_torture_init(void) pr_cont("\n"); WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; + cur_ops = NULL; goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -2358,7 +2434,10 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - firsterr = torture_stutter_init(stutter * HZ); + int t; + + t = cur_ops->stall_dur ? 
cur_ops->stall_dur() : stutter * HZ; + firsterr = torture_stutter_init(stutter * HZ, t); if (firsterr) goto unwind; } @@ -2389,7 +2468,8 @@ rcu_torture_init(void) firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); if (firsterr) goto unwind; - firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval); + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, + rcutorture_sync); if (firsterr) goto unwind; firsterr = rcu_torture_stall_init(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 32dfd6522548..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion, * tiny version for non-preemptible single-CPU use. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2017 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> */ #include <linux/export.h> @@ -89,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); * Must invoke this after you are finished using a given srcu_struct that * was initialized via init_srcu_struct(), else you leak memory. */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp) { WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); - if (quiesced) - WARN_ON(work_pending(&ssp->srcu_work)); - else - flush_work(&ssp->srcu_work); + flush_work(&ssp->srcu_work); WARN_ON(ssp->srcu_gp_running); WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 3600d88d8956..cf0e886314f2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1,24 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Sleepable Read-Copy Update mechanism for mutual exclusion. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * - * Author: Paul McKenney <paulmck@us.ibm.com> + * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - @@ -58,6 +45,7 @@ static bool __read_mostly srcu_init_done; static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); static void process_srcu(struct work_struct *work); +static void srcu_delay_timer(struct timer_list *t); /* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ #define spin_lock_rcu_node(p) \ @@ -156,7 +144,8 @@ static void init_srcu_struct_nodes(struct srcu_struct *ssp, bool is_static) snp->grphi = cpu; } sdp->cpu = cpu; - INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + INIT_WORK(&sdp->work, srcu_invoke_callbacks); + timer_setup(&sdp->delay_work, srcu_delay_timer, 0); sdp->ssp = ssp; sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); if (is_static) @@ -371,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp) return SRCU_INTERVAL; } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp) { int cpu; @@ -380,19 +375,15 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) return; /* Just leak it! */ if (WARN_ON(srcu_readers_active(ssp))) return; /* Just leak it! */ - if (quiesced) { - if (WARN_ON(delayed_work_pending(&ssp->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&ssp->work); + flush_delayed_work(&ssp->work); + for_each_possible_cpu(cpu) { + struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); + + del_timer_sync(&sdp->delay_work); + flush_work(&sdp->work); + if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) + return; /* Forgot srcu_barrier(), so just leak it! */ } - for_each_possible_cpu(cpu) - if (quiesced) { - if (WARN_ON(delayed_work_pending(&per_cpu_ptr(ssp->sda, cpu)->work))) - return; /* Just leak it! */ - } else { - flush_delayed_work(&per_cpu_ptr(ssp->sda, cpu)->work); - } if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) || WARN_ON(srcu_readers_active(ssp))) { pr_info("%s: Active srcu_struct %p state: %d\n", @@ -402,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) free_percpu(ssp->sda); ssp->sda = NULL; } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); /* * Counts the new reader in the appropriate per-CPU element of the @@ -463,39 +454,23 @@ static void srcu_gp_start(struct srcu_struct *ssp) WARN_ON_ONCE(state != SRCU_STATE_SCAN1); } -/* - * Track online CPUs to guide callback workqueue placement. 
- */ -DEFINE_PER_CPU(bool, srcu_online); -void srcu_online_cpu(unsigned int cpu) +static void srcu_delay_timer(struct timer_list *t) { - WRITE_ONCE(per_cpu(srcu_online, cpu), true); -} + struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work); -void srcu_offline_cpu(unsigned int cpu) -{ - WRITE_ONCE(per_cpu(srcu_online, cpu), false); + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); } -/* - * Place the workqueue handler on the specified CPU if online, otherwise - * just run it whereever. This is useful for placing workqueue handlers - * that are to invoke the specified CPU's callbacks. - */ -static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, - struct delayed_work *dwork, +static void srcu_queue_delayed_work_on(struct srcu_data *sdp, unsigned long delay) { - bool ret; + if (!delay) { + queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work); + return; + } - preempt_disable(); - if (READ_ONCE(per_cpu(srcu_online, cpu))) - ret = queue_delayed_work_on(cpu, wq, dwork, delay); - else - ret = queue_delayed_work(wq, dwork, delay); - preempt_enable(); - return ret; + timer_reduce(&sdp->delay_work, jiffies + delay); } /* @@ -504,7 +479,7 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, */ static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) { - srcu_queue_delayed_work_on(sdp->cpu, rcu_gp_wq, &sdp->work, delay); + srcu_queue_delayed_work_on(sdp, delay); } /* @@ -856,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) * srcu_read_lock(), and srcu_read_unlock() that are all passed the same * srcu_struct structure. */ -void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, - rcu_callback_t func, bool do_norm) +static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) { unsigned long flags; int idx; @@ -1186,7 +1161,8 @@ static void srcu_invoke_callbacks(struct work_struct *work) struct srcu_data *sdp; struct srcu_struct *ssp; - sdp = container_of(work, struct srcu_data, work.work); + sdp = container_of(work, struct srcu_data, work); + ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); @@ -1334,3 +1310,68 @@ void __init srcu_init(void) queue_work(rcu_gp_wq, &ssp->work.work); } } + +#ifdef CONFIG_MODULES + +/* Initialize any global-scope srcu_struct structures used by this module. */ +static int srcu_module_coming(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + int ret; + + for (i = 0; i < mod->num_srcu_structs; i++) { + ret = init_srcu_struct(*(sspp++)); + if (WARN_ON_ONCE(ret)) + return ret; + } + return 0; +} + +/* Clean up any global-scope srcu_struct structures used by this module. */ +static void srcu_module_going(struct module *mod) +{ + int i; + struct srcu_struct **sspp = mod->srcu_struct_ptrs; + + for (i = 0; i < mod->num_srcu_structs; i++) + cleanup_srcu_struct(*(sspp++)); +} + +/* Handle one module, either coming or going. 
*/ +static int srcu_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + ret = srcu_module_coming(mod); + break; + case MODULE_STATE_GOING: + srcu_module_going(mod); + break; + default: + break; + } + return ret; +} + +static struct notifier_block srcu_module_nb = { + .notifier_call = srcu_module_notify, + .priority = 0, +}; + +static __init int init_srcu_module_notifier(void) +{ + int ret; + + ret = register_module_notifier(&srcu_module_nb); + if (ret) + pr_warn("Failed to register srcu module notifier\n"); + return ret; +} +late_initcall(init_srcu_module_notifier); + +#endif /* #ifdef CONFIG_MODULES */ diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index be10036fa621..d4558ab7a07d 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * RCU-based infrastructure for lightweight reader-writer locking * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> @@ -23,65 +10,18 @@ #include <linux/rcu_sync.h> #include <linux/sched.h> -#ifdef CONFIG_PROVE_RCU -#define __INIT_HELD(func) .held = func, -#else -#define __INIT_HELD(func) -#endif - -static const struct { - void (*sync)(void); - void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); - void (*wait)(void); -#ifdef CONFIG_PROVE_RCU - int (*held)(void); -#endif -} gp_ops[] = { - [RCU_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_held) - }, - [RCU_SCHED_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_sched_held) - }, - [RCU_BH_SYNC] = { - .sync = synchronize_rcu, - .call = call_rcu, - .wait = rcu_barrier, - __INIT_HELD(rcu_read_lock_bh_held) - }, -}; - -enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; -enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; +enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; #define rss_lock gp_wait.lock -#ifdef CONFIG_PROVE_RCU -void rcu_sync_lockdep_assert(struct rcu_sync *rsp) -{ - RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), - "suspicious rcu_sync_is_idle() usage"); -} - -EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); -#endif - /** * rcu_sync_init() - Initialize an rcu_sync structure * @rsp: Pointer to rcu_sync structure to be initialized - * @type: Flavor of RCU with which to synchronize rcu_sync structure */ -void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +void rcu_sync_init(struct rcu_sync *rsp) { memset(rsp, 0, sizeof(*rsp)); init_waitqueue_head(&rsp->gp_wait); - rsp->gp_type = type; } /** @@ -99,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) rsp->gp_state = GP_PASSED; } -/** - * rcu_sync_enter() - Force readers onto slowpath - * @rsp: Pointer to rcu_sync 
structure to use for synchronization - * - * This function is used by updaters who need readers to make use of - * a slowpath during the update. After this function returns, all - * subsequent calls to rcu_sync_is_idle() will return false, which - * tells readers to stay off their fastpaths. A later call to - * rcu_sync_exit() re-enables reader slowpaths. - * - * When called in isolation, rcu_sync_enter() must wait for a grace - * period, however, closely spaced calls to rcu_sync_enter() can - * optimize away the grace-period wait via a state machine implemented - * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). - */ -void rcu_sync_enter(struct rcu_sync *rsp) -{ - bool need_wait, need_sync; - spin_lock_irq(&rsp->rss_lock); - need_wait = rsp->gp_count++; - need_sync = rsp->gp_state == GP_IDLE; - if (need_sync) - rsp->gp_state = GP_PENDING; - spin_unlock_irq(&rsp->rss_lock); +static void rcu_sync_func(struct rcu_head *rhp); - WARN_ON_ONCE(need_wait && need_sync); - if (need_sync) { - gp_ops[rsp->gp_type].sync(); - rsp->gp_state = GP_PASSED; - wake_up_all(&rsp->gp_wait); - } else if (need_wait) { - wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); - } else { - /* - * Possible when there's a pending CB from a rcu_sync_exit(). - * Nobody has yet been allowed the 'fast' path and thus we can - * avoid doing any sync(). The callback will get 'dropped'. - */ - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - } +static void rcu_sync_call(struct rcu_sync *rsp) +{ + call_rcu(&rsp->cb_head, rcu_sync_func); } /** * rcu_sync_func() - Callback function managing reader access to fastpath * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization * - * This function is passed to one of the call_rcu() functions by + * This function is passed to call_rcu() by rcu_sync_enter() and * rcu_sync_exit(), so that it is invoked after a grace period following the - * that invocation of rcu_sync_exit(). It takes action based on events that + * invocation of enter/exit. + * + * If it is called by rcu_sync_enter(), it signals that all the readers were + * switched onto the slow path. + * + * If it is called by rcu_sync_exit(), it takes action based on events that * have taken place in the meantime, so that closely spaced rcu_sync_enter() * and rcu_sync_exit() pairs need not wait for a grace period. * @@ -165,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp) struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); unsigned long flags; - WARN_ON_ONCE(rsp->gp_state != GP_PASSED); - WARN_ON_ONCE(rsp->cb_state == CB_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irqsave(&rsp->rss_lock, flags); if (rsp->gp_count) { /* - * A new rcu_sync_begin() has happened; drop the callback. + * We're at least a GP after the GP_IDLE->GP_ENTER transition. */ - rsp->cb_state = CB_IDLE; - } else if (rsp->cb_state == CB_REPLAY) { + WRITE_ONCE(rsp->gp_state, GP_PASSED); + wake_up_locked(&rsp->gp_wait); + } else if (rsp->gp_state == GP_REPLAY) { /* - * A new rcu_sync_exit() has happened; requeue the callback - * to catch a later GP. + * A new rcu_sync_exit() has happened; requeue the callback to + * catch a later GP. */ - rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); } else { /* - * We're at least a GP after rcu_sync_exit(); eveybody will now - * have observed the write side critical section. Let 'em rip!. 
+ * We're at least a GP after the last rcu_sync_exit(); everybody + * will now have observed the write side critical section. + * Let 'em rip!. */ - rsp->cb_state = CB_IDLE; - rsp->gp_state = GP_IDLE; + WRITE_ONCE(rsp->gp_state, GP_IDLE); } spin_unlock_irqrestore(&rsp->rss_lock, flags); } /** - * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + int gp_state; + + spin_lock_irq(&rsp->rss_lock); + gp_state = rsp->gp_state; + if (gp_state == GP_IDLE) { + WRITE_ONCE(rsp->gp_state, GP_ENTER); + WARN_ON_ONCE(rsp->gp_count); + /* + * Note that we could simply do rcu_sync_call(rsp) here and + * avoid the "if (gp_state == GP_IDLE)" block below. + * + * However, synchronize_rcu() can be faster if rcu_expedited + * or rcu_blocking_is_gp() is true. + * + * Another reason is that we can't wait for rcu callback if + * we are called at early boot time but this shouldn't happen. + */ + } + rsp->gp_count++; + spin_unlock_irq(&rsp->rss_lock); + + if (gp_state == GP_IDLE) { + /* + * See the comment above, this simply does the "synchronous" + * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. + */ + synchronize_rcu(); + rcu_sync_func(&rsp->cb_head); + /* Not really needed, wait_event() would see GP_PASSED. 
*/ + return; + } + + wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast path after grace period * @rsp: Pointer to rcu_sync structure to use for synchronization * * This function is used by updaters who have completed, and can therefore @@ -204,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp) */ void rcu_sync_exit(struct rcu_sync *rsp) { + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); + spin_lock_irq(&rsp->rss_lock); if (!--rsp->gp_count) { - if (rsp->cb_state == CB_IDLE) { - rsp->cb_state = CB_PENDING; - gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); - } else if (rsp->cb_state == CB_PENDING) { - rsp->cb_state = CB_REPLAY; + if (rsp->gp_state == GP_PASSED) { + WRITE_ONCE(rsp->gp_state, GP_EXIT); + rcu_sync_call(rsp); + } else if (rsp->gp_state == GP_EXIT) { + WRITE_ONCE(rsp->gp_state, GP_REPLAY); } } spin_unlock_irq(&rsp->rss_lock); @@ -222,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) */ void rcu_sync_dtor(struct rcu_sync *rsp) { - int cb_state; + int gp_state; - WARN_ON_ONCE(rsp->gp_count); + WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); + WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); spin_lock_irq(&rsp->rss_lock); - if (rsp->cb_state == CB_REPLAY) - rsp->cb_state = CB_PENDING; - cb_state = rsp->cb_state; + if (rsp->gp_state == GP_REPLAY) + WRITE_ONCE(rsp->gp_state, GP_EXIT); + gp_state = rsp->gp_state; spin_unlock_irq(&rsp->rss_lock); - if (cb_state != CB_IDLE) { - gp_ops[rsp->gp_type].wait(); - WARN_ON_ONCE(rsp->cb_state != CB_IDLE); + if (gp_state != GP_IDLE) { + rcu_barrier(); + WARN_ON_ONCE(rsp->gp_state != GP_IDLE); } } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 5f5963ba313e..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * - * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU @@ -65,7 +52,7 @@ void rcu_qs(void) local_irq_save(flags); if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; - raise_softirq(RCU_SOFTIRQ); + raise_softirq_irqoff(RCU_SOFTIRQ); } local_irq_restore(flags); } @@ -76,7 +63,7 @@ void rcu_qs(void) * be called from hardirq context. It is normally called from the * scheduling-clock interrupt. 
*/ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { if (user) { rcu_qs(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 9180158756d2..a14e5fbbea46 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1,27 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version + * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - @@ -62,6 +49,14 @@ #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/tick.h> +#include <linux/sysrq.h> +#include <linux/kprobes.h> +#include <linux/gfp.h> +#include <linux/oom.h> +#include <linux/smpboot.h> +#include <linux/jiffies.h> +#include <linux/sched/isolation.h> +#include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" @@ -103,6 +98,9 @@ struct rcu_state rcu_state = { /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); +/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ +static bool use_softirq = 1; +module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); @@ -113,8 +111,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable is initialized to the value @@ -151,13 +147,12 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); -static void invoke_rcu_callbacks(struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. 
*/ @@ -381,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void) } /** - * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle + * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle * - * If the current CPU is idle or running at a first-level (not nested) + * If the current CPU is idle and running at a first-level (not nested) * interrupt from idle, return true. The caller must have at least * disabled preemption. */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && - __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; + /* Called only from within the scheduling-clock interrupt */ + lockdep_assert_in_irq(); + + /* Check for counter underflows */ + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, + "RCU dynticks_nesting counter underflow!"); + RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, + "RCU dynticks_nmi_nesting counter underflow/zero!"); + + /* Are we at first interrupt nesting level? */ + if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) + return false; + + /* Does CPU appear to be idle from an RCU standpoint? */ + return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ +#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ +#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ static long blimit = DEFAULT_RCU_BLIMIT; #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ static long qhimark = DEFAULT_RCU_QHIMARK; @@ -414,7 +423,7 @@ static bool rcu_kick_kthreads; */ static ulong jiffies_till_sched_qs = ULONG_MAX; module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ /* @@ -432,6 +441,7 @@ static void adjust_jiffies_till_sched_qs(void) WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs); return; } + /* Otherwise, set to third fqs scan, but bound below on large system. */ j = READ_ONCE(jiffies_till_first_fqs) + 2 * READ_ONCE(jiffies_till_next_fqs); if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) @@ -479,7 +489,6 @@ module_param_cb(jiffies_till_next_fqs, &next_fqs_jiffies_ops, &jiffies_till_next module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); -static void force_quiescent_state(void); static int rcu_pending(void); /* @@ -504,13 +513,12 @@ unsigned long rcu_exp_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); /* - * Force a quiescent state. + * Return the root node of the rcu_state structure. */ -void rcu_force_quiescent_state(void) +static struct rcu_node *rcu_get_root(void) { - force_quiescent_state(); + return &rcu_state.node[0]; } -EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); /* * Convert a ->gp_state value to a character string. @@ -523,42 +531,6 @@ static const char *gp_state_getname(short gs) } /* - * Show the state of the grace-period kthreads. 
- */ -void show_rcu_gp_kthreads(void) -{ - int cpu; - unsigned long j; - struct rcu_data *rdp; - struct rcu_node *rnp; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %ld\n", - rcu_state.name, gp_state_getname(rcu_state.gp_state), - rcu_state.gp_state, rcu_state.gp_kthread->state, j); - rcu_for_each_node_breadth_first(rnp) { - if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) - continue; - pr_info("\trcu_node %d:%d ->gp_seq %lu ->gp_seq_needed %lu\n", - rnp->grplo, rnp->grphi, rnp->gp_seq, - rnp->gp_seq_needed); - if (!rcu_is_leaf_node(rnp)) - continue; - for_each_leaf_node_possible_cpu(rnp, cpu) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (rdp->gpwrap || - ULONG_CMP_GE(rcu_state.gp_seq, - rdp->gp_seq_needed)) - continue; - pr_info("\tcpu %d ->gp_seq_needed %lu\n", - cpu, rdp->gp_seq_needed); - } - } - /* sched_show_task(rcu_state.gp_kthread); */ -} -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); - -/* * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -566,8 +538,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, { switch (test_type) { case RCU_FLAVOR: - case RCU_BH_FLAVOR: - case RCU_SCHED_FLAVOR: *flags = READ_ONCE(rcu_state.gp_flags); *gp_seq = rcu_seq_current(&rcu_state.gp_seq); break; @@ -578,14 +548,6 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); /* - * Return the root node of the rcu_state structure. - */ -static struct rcu_node *rcu_get_root(void) -{ - return &rcu_state.node[0]; -} - -/* * Enter an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. * @@ -701,7 +663,6 @@ static __always_inline void rcu_nmi_exit_common(bool irq) /** * rcu_nmi_exit - inform RCU of exit from NMI context - * @irq: Is this call from rcu_irq_exit? * * If you add or remove a call to rcu_nmi_exit(), be sure to test * with CONFIG_RCU_EQS_DEBUG=y. @@ -872,6 +833,7 @@ void rcu_nmi_enter(void) { rcu_nmi_enter_common(false); } +NOKPROBE_SYMBOL(rcu_nmi_enter); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -1022,27 +984,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) } /* - * Handler for the irq_work request posted when a grace period has - * gone on for too long, but not yet long enough for an RCU CPU - * stall warning. Set state appropriately, but just complain if - * there is unexpected state on entry. - */ -static void rcu_iw_handler(struct irq_work *iwp) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = container_of(iwp, struct rcu_data, rcu_iw); - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); - if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { - rdp->rcu_iw_gp_seq = rnp->gp_seq; - rdp->rcu_iw_pending = false; - } - raw_spin_unlock_rcu_node(rnp); -} - -/* * Return true if the specified CPU has passed through a quiescent * state by virtue of being in or having passed through an dynticks * idle state since the last call to dyntick_save_progress_counter() @@ -1115,7 +1056,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) } /* - * NO_HZ_FULL CPUs can run in-kernel without rcu_check_callbacks! + * NO_HZ_FULL CPUs can run in-kernel without rcu_sched_clock_irq! * The above code handles this, but only for straight cond_resched(). 
* And some in-kernel loops check need_resched() before calling * cond_resched(), which defeats the above code for CPUs that are @@ -1155,295 +1096,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; } -static void record_gp_stall_check_time(void) -{ - unsigned long j = jiffies; - unsigned long j1; - - rcu_state.gp_start = j; - j1 = rcu_jiffies_till_stall_check(); - /* Record ->gp_start before ->jiffies_stall. */ - smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ - rcu_state.jiffies_resched = j + j1 / 2; - rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ - struct task_struct *gpk = rcu_state.gp_kthread; - unsigned long j; - - j = jiffies - READ_ONCE(rcu_state.gp_activity); - if (j > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", - rcu_state.name, j, - (long)rcu_seq_current(&rcu_state.gp_seq), - rcu_state.gp_flags, - gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); - if (gpk) { - pr_err("RCU grace-period kthread stack dump:\n"); - sched_show_task(gpk); - wake_up_process(gpk); - } - } -} - -/* - * Dump stacks of all tasks running on stalled CPUs. First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps. The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ - int cpu; - unsigned long flags; - struct rcu_node *rnp; - - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ - unsigned long j; - - if (!rcu_kick_kthreads) - return; - j = READ_ONCE(rcu_state.jiffies_kick_kthreads); - if (time_after(jiffies, j) && rcu_state.gp_kthread && - (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { - WARN_ONCE(1, "Kicking %s grace-period kthread\n", - rcu_state.name); - rcu_ftrace_dump(DUMP_ALL); - wake_up_process(rcu_state.gp_kthread); - WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); - } -} - -static void panic_on_rcu_stall(void) -{ - if (sysctl_panic_on_rcu_stall) - panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ - int cpu; - unsigned long flags; - unsigned long gpa; - unsigned long j; - int ndetected = 0; - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on our buddy... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. 
- */ - pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); - print_cpu_stall_info_begin(); - rcu_for_each_leaf_node(rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); - if (rnp->qsmask != 0) { - for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { - print_cpu_stall_info(cpu); - ndetected++; - } - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", - smp_processor_id(), (long)(jiffies - rcu_state.gp_start), - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - if (ndetected) { - rcu_dump_cpu_stacks(); - - /* Complain about tasks blocking the grace period. */ - rcu_print_detail_task_stall(); - } else { - if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { - pr_err("INFO: Stall ended before state dump start\n"); - } else { - j = jiffies; - gpa = READ_ONCE(rcu_state.gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", - rcu_state.name, j - gpa, j, gpa, - READ_ONCE(jiffies_till_next_fqs), - rcu_get_root()->qsmask); - /* In this case, the current CPU might be at fault. */ - sched_show_task(current); - } - } - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - - rcu_check_gp_kthread_starvation(); - - panic_on_rcu_stall(); - - force_quiescent_state(); /* Kick them all. */ -} - -static void print_cpu_stall(void) -{ - int cpu; - unsigned long flags; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp = rcu_get_root(); - long totqlen = 0; - - /* Kick and suppress, if so configured. */ - rcu_stall_kick_kthreads(); - if (rcu_cpu_stall_suppress) - return; - - /* - * OK, time to rat on ourselves... - * See Documentation/RCU/stallwarn.txt for info on how to debug - * RCU CPU stall warnings. - */ - pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); - print_cpu_stall_info_begin(); - raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); - print_cpu_stall_info(smp_processor_id()); - raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); - print_cpu_stall_info_end(); - for_each_possible_cpu(cpu) - totqlen += rcu_get_n_cbs_cpu(cpu); - pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", - jiffies - rcu_state.gp_start, - (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - - rcu_check_gp_kthread_starvation(); - - rcu_dump_cpu_stacks(); - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - /* Rewrite if needed in case of slow consoles. */ - if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - panic_on_rcu_stall(); - - /* - * Attempt to revive the RCU machinery by forcing a context switch. - * - * A context switch would normally allow the RCU state machine to make - * progress and it could be we're stuck in kernel space without context - * switches for an entirely unreasonable amount of time. 
- */ - set_tsk_need_resched(current); - set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ - unsigned long gs1; - unsigned long gs2; - unsigned long gps; - unsigned long j; - unsigned long jn; - unsigned long js; - struct rcu_node *rnp; - - if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || - !rcu_gp_in_progress()) - return; - rcu_stall_kick_kthreads(); - j = jiffies; - - /* - * Lots of memory barriers to reject false positives. - * - * The idea is to pick up rcu_state.gp_seq, then - * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally - * another copy of rcu_state.gp_seq. These values are updated in - * the opposite order with memory barriers (or equivalent) during - * grace-period initialization and cleanup. Now, a false positive - * can occur if we get an new value of rcu_state.gp_start and a old - * value of rcu_state.jiffies_stall. But given the memory barriers, - * the only way that this can happen is if one grace period ends - * and another starts between these two fetches. This is detected - * by comparing the second fetch of rcu_state.gp_seq with the - * previous fetch from rcu_state.gp_seq. - * - * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, - * and rcu_state.gp_start suffice to forestall false positives. - */ - gs1 = READ_ONCE(rcu_state.gp_seq); - smp_rmb(); /* Pick up ->gp_seq first... */ - js = READ_ONCE(rcu_state.jiffies_stall); - smp_rmb(); /* ...then ->jiffies_stall before the rest... */ - gps = READ_ONCE(rcu_state.gp_start); - smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ - gs2 = READ_ONCE(rcu_state.gp_seq); - if (gs1 != gs2 || - ULONG_CMP_LT(j, js) || - ULONG_CMP_GE(gps, js)) - return; /* No stall or GP completed since entering function. */ - rnp = rdp->mynode; - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(); - - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2); - } -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ - WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} - /* Trace-event wrapper function for trace_rcu_future_grace_period. */ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) @@ -1557,17 +1209,28 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) } /* - * Awaken the grace-period kthread. Don't do a self-awaken, and don't - * bother awakening when there is nothing for the grace-period kthread - * to do (as in several CPUs raced to awaken, and we lost), and finally - * don't try to awaken a kthread that has not yet been created. + * Awaken the grace-period kthread. 
Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created. If all those checks are passed, track some + * debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(void) { - if (current == rcu_state.gp_kthread || + if ((current == rcu_state.gp_kthread && + !in_irq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !rcu_state.gp_kthread) return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); swake_up_one(&rcu_state.gp_wq); } @@ -1711,7 +1374,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) zero_cpu_stall_ticks(rdp); } rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */ - if (ULONG_CMP_GE(rnp->gp_seq_needed, rdp->gp_seq_needed) || rdp->gpwrap) + if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap) rdp->gp_seq_needed = rnp->gp_seq_needed; WRITE_ONCE(rdp->gpwrap, false); rcu_gpnum_ovf(rnp, rdp); @@ -1939,7 +1602,7 @@ static void rcu_gp_fqs_loop(void) if (!ret) { rcu_state.jiffies_force_qs = jiffies + j; WRITE_ONCE(rcu_state.jiffies_kick_kthreads, - jiffies + 3 * j); + jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), @@ -2272,11 +1935,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = false; - /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. @@ -2329,14 +1991,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp) */ int rcutree_dying_cpu(unsigned int cpu) { - RCU_TRACE(bool blkd;) - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) - RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) + bool blkd; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; - RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) + blkd = !!(rnp->qsmask & rdp->grpmask); trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); return 0; @@ -2473,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); - if (rdp->blimit == LONG_MAX && count <= qlowmark) + if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ @@ -2497,14 +2159,14 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. 
+ * This function is invoked from each scheduling-clock interrupt, + * and checks to see if this CPU is in a non-context-switch quiescent + * state, for example, user mode or idle loop. It also schedules RCU + * core processing. If the current grace period has gone on too long, + * it will ask the scheduler to manufacture a context switch for the sole + * purpose of providing the needed quiescent state. */ -void rcu_check_callbacks(int user) +void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); raw_cpu_inc(rcu_data.ticks_this_gp); @@ -2517,7 +2179,7 @@ void rcu_check_callbacks(int user) } __this_cpu_write(rcu_data.rcu_urgent_qs, false); } - rcu_flavor_check_callbacks(user); + rcu_flavor_sched_clock_irq(user); if (rcu_pending()) invoke_rcu_core(); @@ -2525,11 +2187,11 @@ void rcu_check_callbacks(int user) } /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. + * Scan the leaf rcu_node structures. For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state. */ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { @@ -2578,7 +2240,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ -static void force_quiescent_state(void) +void rcu_force_quiescent_state(void) { unsigned long flags; bool ret; @@ -2610,113 +2272,10 @@ static void force_quiescent_state(void) raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); rcu_gp_kthread_wake(); } +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, - const unsigned long gpssdelay) -{ - unsigned long flags; - unsigned long j; - struct rcu_node *rnp_root = rcu_get_root(); - static atomic_t warned = ATOMIC_INIT(0); - - if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) - return; - j = jiffies; /* Expensive access, and in common case don't get here. */ - if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) - return; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || - time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || - atomic_read(&warned)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - /* Hold onto the leaf lock to make others see warned==1. */ - - if (rnp_root != rnp) - raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. 
*/ - j = jiffies; - if (rcu_gp_in_progress() || - ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || - time_before(j, rcu_state.gp_req_activity + gpssdelay) || - time_before(j, rcu_state.gp_activity + gpssdelay) || - atomic_xchg(&warned, 1)) { - raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - pr_alert("%s: g%ld->%ld gar:%lu ga:%lu f%#x gs:%d %s->state:%#lx\n", - __func__, (long)READ_ONCE(rcu_state.gp_seq), - (long)READ_ONCE(rnp_root->gp_seq_needed), - j - rcu_state.gp_req_activity, j - rcu_state.gp_activity, - rcu_state.gp_flags, rcu_state.gp_state, rcu_state.name, - rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL); - WARN_ON(1); - if (rnp_root != rnp) - raw_spin_unlock_rcu_node(rnp_root); - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Do a forward-progress check for rcutorture. This is normally invoked - * due to an OOM event. The argument "j" gives the time period during - * which rcutorture would like progress to have been made. - */ -void rcu_fwd_progress_check(unsigned long j) -{ - unsigned long cbs; - int cpu; - unsigned long max_cbs = 0; - int max_cpu = -1; - struct rcu_data *rdp; - - if (rcu_gp_in_progress()) { - pr_info("%s: GP age %lu jiffies\n", - __func__, jiffies - rcu_state.gp_start); - show_rcu_gp_kthreads(); - } else { - pr_info("%s: Last GP end %lu jiffies ago\n", - __func__, jiffies - rcu_state.gp_end); - preempt_disable(); - rdp = this_cpu_ptr(&rcu_data); - rcu_check_gp_start_stall(rdp->mynode, rdp, j); - preempt_enable(); - } - for_each_possible_cpu(cpu) { - cbs = rcu_get_n_cbs_cpu(cpu); - if (!cbs) - continue; - if (max_cpu < 0) - pr_info("%s: callbacks", __func__); - pr_cont(" %d: %lu", cpu, cbs); - if (cbs <= max_cbs) - continue; - max_cbs = cbs; - max_cpu = cpu; - } - if (max_cpu >= 0) - pr_cont("\n"); -} -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); - -/* - * This does the RCU core processing work for the specified rcu_data - * structures. This may be called only from the CPU to whom the rdp - * belongs. - */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +/* Perform RCU core processing work for the current CPU. */ +static __latent_entropy void rcu_core(void) { unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); @@ -2750,37 +2309,126 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_callbacks(rdp); + if (rcu_segcblist_ready_cbs(&rdp->cblist) && + likely(READ_ONCE(rcu_scheduler_fully_active))) + rcu_do_batch(rdp); /* Do any needed deferred wakeups of rcuo kthreads. 
*/ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); } +static void rcu_core_si(struct softirq_action *h) +{ + rcu_core(); +} + +static void rcu_wake_cond(struct task_struct *t, int status) +{ + /* + * If the thread is yielding, only wake it when this + * is invoked from idle + */ + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) + wake_up_process(t); +} + +static void invoke_rcu_core_kthread(void) +{ + struct task_struct *t; + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); + t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); + if (t != NULL && t != current) + rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); + local_irq_restore(flags); +} + /* - * Schedule RCU callback invocation. If the running implementation of RCU - * does not support RCU priority boosting, just do a direct call, otherwise - * wake up the per-CPU kernel kthread. Note that because we are running - * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. + * Wake up this CPU's rcuc kthread to do RCU core processing. */ -static void invoke_rcu_callbacks(struct rcu_data *rdp) +static void invoke_rcu_core(void) { - if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) - return; - if (likely(!rcu_state.boost)) { - rcu_do_batch(rdp); + if (!cpu_online(smp_processor_id())) return; + if (use_softirq) + raise_softirq(RCU_SOFTIRQ); + else + invoke_rcu_core_kthread(); +} + +static void rcu_cpu_kthread_park(unsigned int cpu) +{ + per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; +} + +static int rcu_cpu_kthread_should_run(unsigned int cpu) +{ + return __this_cpu_read(rcu_data.rcu_cpu_has_work); +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces + * the RCU softirq used in configurations of RCU that do not support RCU + * priority boosting. + */ +static void rcu_cpu_kthread(unsigned int cpu) +{ + unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); + char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); + int spincnt; + + for (spincnt = 0; spincnt < 10; spincnt++) { + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); + local_bh_disable(); + *statusp = RCU_KTHREAD_RUNNING; + local_irq_disable(); + work = *workp; + *workp = 0; + local_irq_enable(); + if (work) + rcu_core(); + local_bh_enable(); + if (*workp == 0) { + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); + *statusp = RCU_KTHREAD_WAITING; + return; + } } - invoke_rcu_callbacks_kthread(); + *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); + schedule_timeout_interruptible(2); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); + *statusp = RCU_KTHREAD_WAITING; } -static void invoke_rcu_core(void) +static struct smp_hotplug_thread rcu_cpu_thread_spec = { + .store = &rcu_data.rcu_cpu_kthread_task, + .thread_should_run = rcu_cpu_kthread_should_run, + .thread_fn = rcu_cpu_kthread, + .thread_comm = "rcuc/%u", + .setup = rcu_cpu_kthread_setup, + .park = rcu_cpu_kthread_park, +}; + +/* + * Spawn per-CPU RCU core processing kthreads. 
+ */ +static int __init rcu_spawn_core_kthreads(void) { - if (cpu_online(smp_processor_id())) - raise_softirq(RCU_SOFTIRQ); + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; + if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) + return 0; + WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), + "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); + return 0; } +early_initcall(rcu_spawn_core_kthreads); /* * Handle any core-RCU processing required by a call_rcu() invocation. @@ -2801,9 +2449,9 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, /* * Force the grace period if too many callbacks or too long waiting. - * Enforce hysteresis, and don't invoke force_quiescent_state() + * Enforce hysteresis, and don't invoke rcu_force_quiescent_state() * if some other CPU has recently done so. Also, don't bother - * invoking force_quiescent_state() if the newly enqueued callback + * invoking rcu_force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > @@ -2817,10 +2465,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); } else { /* Give the grace period a kick. */ - rdp->blimit = LONG_MAX; + rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; if (rcu_state.n_force_qs == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) - force_quiescent_state(); + rcu_force_quiescent_state(); rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } @@ -2855,7 +2503,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", + WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", head, head->func); WRITE_ONCE(head->func, rcu_leak_callback); return; @@ -2889,9 +2537,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy) rcu_segcblist_init(&rdp->cblist); } rcu_segcblist_enqueue(&rdp->cblist, head, lazy); - if (!lazy) - rcu_idle_count_callbacks_posted(); - if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, @@ -2961,6 +2606,79 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(kfree_call_rcu); +/* + * During early boot, any blocking grace-period wait automatically + * implies a grace period. Later on, this is never the case for PREEMPT. + * + * However, because a context switch is a grace period for !PREEMPT, any + * blocking grace-period wait automatically implies a grace period if + * there is only one CPU online at any point in time during execution of + * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to + * occasionally incorrectly indicate that there are multiple CPUs online + * when there was in fact only one the whole time, as this just adds some + * overhead: RCU still operates correctly. + */ +static int rcu_blocking_is_gp(void) +{ + int ret; + + if (IS_ENABLED(CONFIG_PREEMPT)) + return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; + might_sleep(); /* Check for RCU read-side critical section. 
*/ + preempt_disable(); + ret = num_online_cpus() <= 1; + preempt_enable(); + return ret; +} + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + * In addition, regions of code across which interrupts, preemption, or + * softirqs have been disabled also serve as RCU read-side critical + * sections. This includes hardware interrupt handlers, softirq handlers, + * and NMI handlers. + * + * Note that this guarantee implies further memory-ordering guarantees. + * On systems with more than one CPU, when synchronize_rcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last RCU read-side critical section whose beginning + * preceded the call to synchronize_rcu(). In addition, each CPU having + * an RCU read-side critical section that extends beyond the return from + * synchronize_rcu() is guaranteed to have executed a full memory barrier + * after the beginning of synchronize_rcu() and before the beginning of + * that RCU read-side critical section. Note that these guarantees include + * CPUs that are offline, idle, or executing in user mode, as well as CPUs + * that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_rcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but + * again only if the system has more than one CPU). + */ +void synchronize_rcu(void) +{ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + if (rcu_blocking_is_gp()) + return; + if (rcu_gp_is_expedited()) + synchronize_rcu_expedited(); + else + wait_rcu_gp(call_rcu); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + /** * get_state_synchronize_rcu - Snapshot current RCU state * @@ -3049,28 +2767,6 @@ static int rcu_pending(void) } /* - * Return true if the specified CPU has any callback. If all_lazy is - * non-NULL, store an indication of whether all callbacks are lazy. - * (If there are no callbacks, all of them are deemed to be lazy.) - */ -static bool rcu_cpu_has_callbacks(bool *all_lazy) -{ - bool al = true; - bool hc = false; - struct rcu_data *rdp; - - rdp = this_cpu_ptr(&rcu_data); - if (!rcu_segcblist_empty(&rdp->cblist)) { - hc = true; - if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) - al = false; - } - if (all_lazy) - *all_lazy = al; - return hc; -} - -/* * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. 
*/ @@ -3299,7 +2995,7 @@ int rcutree_prepare_cpu(unsigned int cpu) trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); return 0; } @@ -3329,8 +3025,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->ffmask |= rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_online_cpu(cpu); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); @@ -3355,8 +3049,6 @@ int rcutree_offline_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcutree_affinity_setting(cpu, cpu); - if (IS_ENABLED(CONFIG_TREE_SRCU)) - srcu_offline_cpu(cpu); return 0; } @@ -3500,13 +3192,11 @@ static int rcu_pm_notify(struct notifier_block *self, switch (action) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedite_gp(); + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_unexpedite_gp(); + rcu_unexpedite_gp(); break; default: break; @@ -3683,8 +3373,7 @@ static void __init rcu_init_geometry(void) jiffies_till_first_fqs = d; if (jiffies_till_next_fqs == ULONG_MAX) jiffies_till_next_fqs = d; - if (jiffies_till_sched_qs == ULONG_MAX) - adjust_jiffies_till_sched_qs(); + adjust_jiffies_till_sched_qs(); /* If the compile-time values are accurate, just leave. */ if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -3777,7 +3466,8 @@ void __init rcu_init(void) rcu_init_one(); if (dump_tree) rcu_dump_rcu_node_tree(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + if (use_softirq) + open_softirq(RCU_SOFTIRQ, rcu_core_si); /* * We don't need protection against CPU-hotplug here because @@ -3799,5 +3489,6 @@ void __init rcu_init(void) srcu_init(); } +#include "tree_stall.h" #include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index d90b02b53c0e..7acaf3a62d39 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -1,25 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2008 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/cache.h> @@ -36,7 +23,6 @@ /* Communicate arguments to a workqueue handler. 
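The suspend-path change above now expedites grace periods unconditionally; rcu_expedite_gp() and rcu_unexpedite_gp() nest, so each PREPARE event must stay paired with its POST event. A sketch of how another subsystem could hook the same PM events; the example_* names are hypothetical and NOTIFY_OK is simply one reasonable return value:

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/suspend.h>

static int example_pm_notify(struct notifier_block *self,
			     unsigned long action, void *data)
{
	switch (action) {
	case PM_HIBERNATION_PREPARE:
	case PM_SUSPEND_PREPARE:
		rcu_expedite_gp();	/* Nests: must be undone below. */
		break;
	case PM_POST_HIBERNATION:
	case PM_POST_SUSPEND:
		rcu_unexpedite_gp();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block example_pm_nb = {
	.notifier_call = example_pm_notify,
};

static int __init example_pm_init(void)
{
	return register_pm_notifier(&example_pm_nb);
}
late_initcall(example_pm_init);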
*/ struct rcu_exp_work { - smp_call_func_t rew_func; unsigned long rew_s; struct work_struct rew_work; }; @@ -168,13 +154,15 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool deferred_qs; /* This CPU awaiting a deferred QS? */ + bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ + struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ + bool defer_qs_iw_pending; /* Scheduler attention pending? */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ @@ -194,10 +182,7 @@ struct rcu_data { bool rcu_need_heavy_qs; /* GP old, so heavy quiescent state! */ bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* Are all CPU's CBs lazy? */ - unsigned long nonlazy_posted; /* # times non-lazy CB posted to CPU. */ - unsigned long nonlazy_posted_snap; - /* Nonlazy_posted snapshot. */ + bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ @@ -234,7 +219,13 @@ struct rcu_data { /* Leader CPU takes GP-end wakeups. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ - /* 6) Diagnostic data, including RCU CPU stall warnings. */ + /* 6) RCU priority boosting. */ + struct task_struct *rcu_cpu_kthread_task; + /* rcuc per-CPU kthread or NULL. */ + unsigned int rcu_cpu_kthread_status; + char rcu_cpu_has_work; + + /* 7) Diagnostic data, including RCU CPU stall warnings. */ unsigned int softirq_snap; /* Snapshot of softirq activity. */ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */ struct irq_work rcu_iw; /* Check for non-irq activity. */ @@ -303,6 +294,8 @@ struct rcu_state { struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ + unsigned long gp_wake_time; /* Last GP kthread wake. */ + unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ /* End of fields guarded by root rcu_node's lock. 
*/ @@ -402,42 +395,29 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; int rcu_dynticks_snap(struct rcu_data *rdp); -#ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); -#endif /* #ifdef CONFIG_RCU_BOOST */ - -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp); static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); -static void rcu_flavor_check_callbacks(int user); +static void rcu_flavor_sched_clock_irq(int user); void call_rcu(struct rcu_head *head, rcu_callback_t func); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); -static void invoke_rcu_callbacks_kthread(void); static bool rcu_is_callbacks_kthread(void); +static void rcu_cpu_kthread_setup(unsigned int cpu); static void __init rcu_spawn_boost_kthreads(void); static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); -static void rcu_idle_count_callbacks_posted(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static bool rcu_nocb_cpu_needs_barrier(int cpu); static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); @@ -451,7 +431,7 @@ static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); -static void rcu_spawn_all_nocb_kthreads(int cpu); +static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); @@ -463,10 +443,9 @@ static bool rcu_nohz_full_cpu(void); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); -#ifdef CONFIG_SRCU -void srcu_online_cpu(unsigned int cpu); -void srcu_offline_cpu(unsigned int cpu); -#else /* #ifdef CONFIG_SRCU */ -void srcu_online_cpu(unsigned int cpu) { } -void srcu_offline_cpu(unsigned int cpu) { } -#endif /* #else #ifdef CONFIG_SRCU */ +/* Forward declarations for tree_stall.h */ +static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); +static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 928fe5893a57..af7e7b9c86af 100644 --- 
a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -1,27 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU expedited grace periods * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2016 * - * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Authors: Paul E. McKenney <paulmck@linux.ibm.com> */ #include <linux/lockdep.h> +static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); + /* * Record the start of an expedited grace period. */ @@ -260,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->deferred_qs, false); + WRITE_ONCE(rdp->exp_deferred_qs, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -269,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s) { if (rcu_exp_gp_seq_done(s)) { trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ + smp_mb(); /* Ensure test happens before caller kfree(). */ return true; } return false; @@ -344,7 +333,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) { int cpu; unsigned long flags; - smp_call_func_t func; unsigned long mask_ofl_test; unsigned long mask_ofl_ipi; int ret; @@ -352,7 +340,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) container_of(wp, struct rcu_exp_work, rew_work); struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew); - func = rewp->rew_func; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ @@ -396,7 +383,12 @@ retry_ipi: mask_ofl_test |= mask; continue; } - ret = smp_call_function_single(cpu, func, NULL, 0); + if (get_cpu() == cpu) { + put_cpu(); + continue; + } + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); if (!ret) { mask_ofl_ipi &= ~mask; continue; @@ -426,7 +418,7 @@ retry_ipi: * Select the nodes that the upcoming expedited grace period needs * to wait for. */ -static void sync_rcu_exp_select_cpus(smp_call_func_t func) +static void sync_rcu_exp_select_cpus(void) { int cpu; struct rcu_node *rnp; @@ -440,7 +432,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - rnp->rew.rew_func = func; if (!READ_ONCE(rcu_par_gp_wq) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { @@ -449,7 +440,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) continue; } INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - preempt_disable(); cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); /* If all offline, queue the work on an unbound CPU. 
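The get_cpu()/put_cpu() check added above avoids sending an IPI to the local CPU while keeping the CPU comparison stable against migration. A stand-alone sketch of the same pattern; the example_* names are hypothetical:

#include <linux/bug.h>
#include <linux/smp.h>

static void example_ipi_handler(void *unused)
{
	/* Runs in IPI context on the target CPU. */
}

static void example_poke_cpu(int cpu)
{
	int ret;

	if (get_cpu() == cpu) {		/* get_cpu() also disables preemption. */
		put_cpu();
		return;			/* No self-IPI: the local CPU is handled elsewhere. */
	}
	ret = smp_call_function_single(cpu, example_ipi_handler, NULL, 0);
	put_cpu();
	WARN_ON_ONCE(ret);		/* Nonzero means the target CPU was unreachable. */
}

Holding the get_cpu() reference across the smp_call_function_single() call, as the patch does, is what keeps the current CPU from changing between the comparison and the IPI.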
*/ if (unlikely(cpu > rnp->grphi - rnp->grplo)) @@ -457,7 +447,6 @@ static void sync_rcu_exp_select_cpus(smp_call_func_t func) else cpu += rnp->grplo; queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); - preempt_enable(); rnp->exp_need_flush = true; } @@ -580,10 +569,10 @@ static void rcu_exp_wait_wake(unsigned long s) * Common code to drive an expedited grace period forward, used by * workqueues and mid-boot-time tasks. */ -static void rcu_exp_sel_wait_wake(smp_call_func_t func, unsigned long s) +static void rcu_exp_sel_wait_wake(unsigned long s) { /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(func); + sync_rcu_exp_select_cpus(); /* Wait and clean up, including waking everyone. */ rcu_exp_wait_wake(s); @@ -597,52 +586,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp) struct rcu_exp_work *rewp; rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_func, rewp->rew_s); -} - -/* - * Given a smp_call_function() handler, kick off the specified - * implementation of expedited grace period. - */ -static void _synchronize_rcu_expedited(smp_call_func_t func) -{ - struct rcu_data *rdp; - struct rcu_exp_work rew; - struct rcu_node *rnp; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(); - if (exp_funnel_lock(s)) - return; /* Someone else did our work for us. */ - - /* Ensure that load happens before action based on it. */ - if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { - /* Direct call during scheduler init and early_initcalls(). */ - rcu_exp_sel_wait_wake(func, s); - } else { - /* Marshall arguments & schedule the expedited grace period. */ - rew.rew_func = func; - rew.rew_s = s; - INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew.rew_work); - } - - /* Wait for expedited grace period to complete. */ - rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); - rnp = rcu_get_root(); - wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], - sync_exp_work_done(s)); - smp_mb(); /* Workqueue actions happen before return. */ - - /* Let the next expedited grace period start. */ - mutex_unlock(&rcu_state.exp_mutex); + rcu_exp_sel_wait_wake(rewp->rew_s); } #ifdef CONFIG_PREEMPT_RCU @@ -654,7 +598,7 @@ static void _synchronize_rcu_expedited(smp_call_func_t func) * ->expmask fields in the rcu_node tree. Otherwise, immediately * report the quiescent state. 
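wait_rcu_exp_gp() above recovers its argument from the embedded work_struct via container_of(), the usual way of marshalling data onto a workqueue. A self-contained sketch of that pattern with hypothetical example_* names; note that the patch itself waits on a sequence-number wait_event() rather than flush_work():

#include <linux/printk.h>
#include <linux/workqueue.h>

struct example_work {
	unsigned long arg;
	struct work_struct work;
};

static void example_handler(struct work_struct *wp)
{
	struct example_work *ewp = container_of(wp, struct example_work, work);

	pr_info("handler ran with arg %lu\n", ewp->arg);
}

static void example_run(unsigned long arg)
{
	struct example_work ew;

	ew.arg = arg;
	INIT_WORK_ONSTACK(&ew.work, example_handler);
	queue_work(system_wq, &ew.work);
	flush_work(&ew.work);		/* Must not return while the work is on-stack. */
	destroy_work_on_stack(&ew.work);
}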
*/ -static void sync_rcu_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { unsigned long flags; struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -671,7 +615,7 @@ static void sync_rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -693,10 +637,11 @@ static void sync_rcu_exp_handler(void *unused) if (t->rcu_read_lock_nesting > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->deferred_qs = true; - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); + rdp->exp_deferred_qs = true; + t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; } /* @@ -708,14 +653,14 @@ static void sync_rcu_exp_handler(void *unused) * * If the CPU is fully enabled (or if some buggy RCU-preempt * read-side critical section is being used from idle), just - * invoke rcu_preempt_defer_qs() to immediately report the + * invoke rcu_preempt_deferred_qs() to immediately report the * quiescent state. We cannot use rcu_read_unlock_special() * because we are in an interrupt handler, which will cause that * function to take an early exit without doing anything. * * Otherwise, force a context switch after the CPU enables everything. */ - rdp->deferred_qs = true; + rdp->exp_deferred_qs = true; if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { rcu_preempt_deferred_qs(t); @@ -730,43 +675,41 @@ static void sync_sched_exp_online_cleanup(int cpu) { } -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - * - * This has the same semantics as (but is more brutal than) synchronize_rcu(). +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. 
*/ -void synchronize_rcu_expedited(void) +static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - _synchronize_rcu_expedited(sync_rcu_exp_handler); + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; } -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* Request an expedited quiescent state. */ +static void rcu_exp_need_qs(void) +{ + __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *unused) +static void rcu_exp_handler(void *unused) { struct rcu_data *rdp; struct rcu_node *rnp; @@ -780,62 +723,117 @@ static void sync_sched_exp_handler(void *unused) rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); - /* Store .exp before .rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + rcu_exp_need_qs(); } /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ static void sync_sched_exp_online_cleanup(int cpu) { + unsigned long flags; + int my_cpu; struct rcu_data *rdp; int ret; struct rcu_node *rnp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + my_cpu = get_cpu(); + /* Quiescent state either not needed or already requested, leave. */ + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + put_cpu(); return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, NULL, 0); + } + /* Quiescent state needed on current CPU, so set it up locally. */ + if (my_cpu == cpu) { + local_irq_save(flags); + rcu_exp_need_qs(); + local_irq_restore(flags); + put_cpu(); + return; + } + /* Quiescent state needed on some other CPU, send IPI. */ + ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); + put_cpu(); WARN_ON_ONCE(ret); } /* - * Because a context switch is a grace period for !PREEMPT, any - * blocking grace-period wait automatically implies a grace period if - * there is only one CPU online at any point time during execution of - * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to - * occasionally incorrectly indicate that there are multiple CPUs online - * when there was in fact only one the whole time, as this just adds some - * overhead: RCU still operates correctly. + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. */ -static int rcu_blocking_is_gp(void) +static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - int ret; - - might_sleep(); /* Check for RCU read-side critical section. 
*/ - preempt_disable(); - ret = num_online_cpus() <= 1; - preempt_enable(); - return ret; + return 0; } -/* PREEMPT=n implementation of synchronize_rcu_expedited(). */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU grace period, but expedite it. The basic idea is to + * IPI all non-idle non-nohz online CPUs. The IPI handler checks whether + * the CPU is in an RCU critical section, and if so, it sets a flag that + * causes the outermost rcu_read_unlock() to report the quiescent state + * for RCU-preempt or asks the scheduler for help for RCU-sched. On the + * other hand, if the CPU is not in an RCU read-side critical section, + * the IPI handler reports the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + * + * This has the same semantics as (but is more brutal than) synchronize_rcu(). + */ void synchronize_rcu_expedited(void) { + struct rcu_exp_work rew; + struct rcu_node *rnp; + unsigned long s; + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - /* If only one CPU, this is automatically a grace period. */ + /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) return; - _synchronize_rcu_expedited(sync_sched_exp_handler); + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(); + if (exp_funnel_lock(s)) + return; /* Someone else did our work for us. */ + + /* Ensure that load happens before action based on it. */ + if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) { + /* Direct call during scheduler init and early_initcalls(). */ + rcu_exp_sel_wait_wake(s); + } else { + /* Marshall arguments & schedule the expedited grace period. */ + rew.rew_s = s; + INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp); + queue_work(rcu_gp_wq, &rew.rew_work); + } + + /* Wait for expedited grace period to complete. */ + rnp = rcu_get_root(); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(s)); + smp_mb(); /* Workqueue actions happen before return. */ + + /* Let the next expedited grace period start. */ + mutex_unlock(&rcu_state.exp_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b3dd2fc0cd6..acb225023ed1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1,63 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * Internal non-public definitions that provide either classic * or preemptible semantics. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
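The comment above asks callers that invoke synchronize_rcu_expedited() in a loop to batch their updates behind a single grace period instead. One way that restructuring can look, extending the hypothetical struct foo from the earlier writer-side sketch with a private free_list field so the RCU-visible ->list linkage stays intact until readers are done:

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	struct list_head list;		/* Linkage in the RCU-protected list. */
	struct list_head free_list;	/* Private linkage for deferred frees. */
	int key;
};

static LIST_HEAD(foo_list);
static DEFINE_SPINLOCK(foo_lock);

/* Free every matching element while paying for only one grace period. */
static void foo_free_batch(int key)
{
	struct foo *fp, *tmp;
	LIST_HEAD(to_free);

	spin_lock(&foo_lock);
	list_for_each_entry_safe(fp, tmp, &foo_list, list) {
		if (fp->key != key)
			continue;
		list_del_rcu(&fp->list);		/* Readers may still see fp. */
		list_add(&fp->free_list, &to_free);	/* ->list must stay usable. */
	}
	spin_unlock(&foo_lock);

	synchronize_rcu();	/* One wait covers every unlinked element. */

	list_for_each_entry_safe(fp, tmp, &to_free, free_list)
		kfree(fp);
}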
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 * * Author: Ingo Molnar <mingo@elte.hu> - * Paul E. McKenney <paulmck@linux.vnet.ibm.com> + * Paul E. McKenney <paulmck@linux.ibm.com> */ -#include <linux/delay.h> -#include <linux/gfp.h> -#include <linux/oom.h> -#include <linux/sched/debug.h> -#include <linux/smpboot.h> -#include <linux/sched/isolation.h> -#include <uapi/linux/sched/types.h> -#include "../time/tick-internal.h" - -#ifdef CONFIG_RCU_BOOST - #include "../locking/rtmutex_common.h" -/* - * Control variables for per-CPU and per-rcu_node kthreads. - */ -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DEFINE_PER_CPU(char, rcu_cpu_has_work); - -#else /* #ifdef CONFIG_RCU_BOOST */ - -/* - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, - * all uses are in dead code. Provide a definition to keep the compiler - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. - * This probably needs to be excluded from -rt builds. - */ -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) -#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) - -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ @@ -117,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + if (!use_softirq) + pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) pr_info("\tRCU debug extended QS entry/exit.\n"); rcupdate_announce_bootup_oddness(); @@ -280,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->deferred_qs); + WARN_ON_ONCE(rdp->exp_deferred_qs); } /* @@ -307,8 +264,8 @@ static void rcu_qs(void) __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - barrier(); /* Coordinate with rcu_flavor_check_callbacks(). */ - current->rcu_read_unlock_special.b.need_qs = false; + barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false); } } @@ -380,7 +337,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->deferred_qs) + if (rdp->exp_deferred_qs) rcu_report_exp_rdp(rdp); trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. 
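For contrast with rcu_qs() and the context-switch handling above, the reader side that those quiescent states must never interrupt is just an rcu_read_lock()/rcu_read_unlock() pair. An illustrative lookup, again with hypothetical struct foo and foo_list names:

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>

struct foo {
	struct list_head list;
	int key;
	int data;
};

extern struct list_head foo_list;	/* The hypothetical RCU-protected list. */

static int foo_lookup(int key)
{
	struct foo *fp;
	int ret = -ENOENT;

	rcu_read_lock();	/* Begin read-side critical section. */
	list_for_each_entry_rcu(fp, &foo_list, list) {
		if (fp->key == key) {
			ret = fp->data;	/* Copy out; fp may be freed after unlock. */
			break;
		}
	}
	rcu_read_unlock();	/* A quiescent state may be reported after this. */
	return ret;
}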
*/ @@ -494,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->deferred_qs) { + if (!special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } + t->rcu_read_unlock_special.b.deferred_qs = false; if (special.b.need_qs) { rcu_qs(); t->rcu_read_unlock_special.b.need_qs = false; - if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { + if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { local_irq_restore(flags); return; } @@ -513,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->deferred_qs) { + if (rdp->exp_deferred_qs) { rcu_report_exp_rdp(rdp); if (!t->rcu_read_unlock_special.s) { local_irq_restore(flags); @@ -602,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.deferred_qs) || + return (__this_cpu_read(rcu_data.exp_deferred_qs) || READ_ONCE(t->rcu_read_unlock_special.s)) && t->rcu_read_lock_nesting <= 0; } @@ -630,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) } /* + * Minimal handler to give the scheduler a chance to re-evaluate. + */ +static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + + rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + rdp->defer_qs_iw_pending = false; +} + +/* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU * read-side critical section. @@ -648,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); - /* Need to defer quiescent state until everything is enabled. */ - if (irqs_were_disabled) { - /* Enabling irqs does not reschedule, so... */ + bool exp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; + + t->rcu_read_unlock_special.b.exp_hint = false; + exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || + (rdp->grpmask & rnp->expmask) || + tick_nohz_full_cpu(rdp->cpu); + // Need to defer quiescent state until everything is enabled. + if ((exp || in_irq()) && irqs_were_disabled && use_softirq && + (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { + // Using softirq, safe to awaken, and we get + // no help from enabling irqs, unlike bh/preempt. raise_softirq_irqoff(RCU_SOFTIRQ); + } else if (exp && irqs_were_disabled && !use_softirq && + !t->rcu_read_unlock_special.b.deferred_qs) { + // Safe to awaken and we get no help from enabling + // irqs, unlike bh/preempt. + invoke_rcu_core(); } else { - /* Enabling BH or preempt does reschedule, so... */ + // Enabling BH or preempt does reschedule, so... + // Also if no expediting or NO_HZ_FULL, slow is OK. set_tsk_need_resched(current); set_preempt_need_resched(); + if (IS_ENABLED(CONFIG_IRQ_WORK) && + !rdp->defer_qs_iw_pending && exp) { + // Get scheduler to re-evaluate and call hooks. + // If !IRQ_WORK, FQS scan will eventually IPI. 
+ init_irq_work(&rdp->defer_qs_iw, + rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = true; + irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); + } } + t->rcu_read_unlock_special.b.deferred_qs = true; local_irq_restore(flags); return; } @@ -666,100 +660,6 @@ static void rcu_read_unlock_special(struct task_struct *t) } /* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. - */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ - unsigned long flags; - struct task_struct *t; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - /* - * We could be printing a lot while holding a spinlock. - * Avoid triggering hard lockup. - */ - touch_nmi_watchdog(); - sched_show_task(t); - } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. - */ -static void rcu_print_detail_task_stall(void) -{ - struct rcu_node *rnp = rcu_get_root(); - - rcu_print_detail_task_stall_rnp(rnp); - rcu_for_each_leaf_node(rnp) - rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ - pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", - rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ - pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rcu_preempt_blocked_readers_cgp(rnp)) - return 0; - rcu_print_task_stall_begin(rnp); - t = list_entry(rnp->gp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - rcu_print_task_stall_end(); - return ndetected; -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - struct task_struct *t; - int ndetected = 0; - - if (!rnp->exp_tasks) - return 0; - t = list_entry(rnp->exp_tasks->prev, - struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - pr_cont(" P%d", t->pid); - ndetected++; - } - return ndetected; -} - -/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -788,13 +688,13 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check for a quiescent state from the current CPU. When a task blocks, - * the task is recorded in the corresponding CPU's rcu_node structure, - * which is checked elsewhere. - * - * Caller must disable hard irqs. + * Check for a quiescent state from the current CPU, including voluntary + * context switches for Tasks RCU. 
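The defer_qs_iw machinery added to rcu_read_unlock_special() above is an ordinary irq_work: queue a minimal handler on a chosen CPU so that interrupt exit there re-runs the scheduler hooks soon. A reduced sketch with hypothetical example_* names; the patch keeps the pending flag per-CPU in rcu_data rather than in a single global:

#include <linux/irq_work.h>

static struct irq_work example_iw;
static bool example_iw_pending;

static void example_iw_handler(struct irq_work *iwp)
{
	/* Runs in hard-irq context on the target CPU. */
	WRITE_ONCE(example_iw_pending, false);
}

static void example_nudge_cpu(int cpu)
{
	if (READ_ONCE(example_iw_pending))
		return;				/* Already queued; nothing to do. */
	WRITE_ONCE(example_iw_pending, true);
	init_irq_work(&example_iw, example_iw_handler);
	irq_work_queue_on(&example_iw, cpu);	/* Fire on the chosen CPU. */
}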
When a task blocks, the task is + * recorded in the corresponding CPU's rcu_node structure, which is checked + * elsewhere, hence this function need only check for quiescent states + * related to the current CPU, not to those related to tasks. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; @@ -825,69 +725,27 @@ static void rcu_flavor_check_callbacks(int user) t->rcu_read_unlock_special.b.need_qs = true; } -/** - * synchronize_rcu - wait until a grace period has elapsed. - * - * Control will return to the caller some time after a full grace - * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. Note, however, that - * upon return from synchronize_rcu(), the caller might well be executing - * concurrently with new RCU read-side critical sections that began while - * synchronize_rcu() was waiting. RCU read-side critical sections are - * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. - * In addition, regions of code across which interrupts, preemption, or - * softirqs have been disabled also serve as RCU read-side critical - * sections. This includes hardware interrupt handlers, softirq handlers, - * and NMI handlers. - * - * Note that this guarantee implies further memory-ordering guarantees. - * On systems with more than one CPU, when synchronize_rcu() returns, - * each CPU is guaranteed to have executed a full memory barrier since - * the end of its last RCU read-side critical section whose beginning - * preceded the call to synchronize_rcu(). In addition, each CPU having - * an RCU read-side critical section that extends beyond the return from - * synchronize_rcu() is guaranteed to have executed a full memory barrier - * after the beginning of synchronize_rcu() and before the beginning of - * that RCU read-side critical section. Note that these guarantees include - * CPUs that are offline, idle, or executing in user mode, as well as CPUs - * that are executing in the kernel. - * - * Furthermore, if CPU A invoked synchronize_rcu(), which returned - * to its caller on CPU B, then both CPU A and CPU B are guaranteed - * to have executed a full memory barrier during the execution of - * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but - * again only if the system has more than one CPU). - */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so. No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so. No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort. Spewing warnings from this + * function is like as not to simply obscure important prior warnings. 
*/ void exit_rcu(void) { struct task_struct *t = current; - if (likely(list_empty(&current->rcu_node_entry))) + if (unlikely(!list_empty(&current->rcu_node_entry))) { + t->rcu_read_lock_nesting = 1; + barrier(); + WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); + } else if (unlikely(t->rcu_read_lock_nesting)) { + t->rcu_read_lock_nesting = 1; + } else { return; - t->rcu_read_lock_nesting = 1; - barrier(); - t->rcu_read_unlock_special.b.blocked = true; + } __rcu_read_unlock(); rcu_preempt_deferred_qs(current); } @@ -919,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) i = 0; list_for_each(lhp, &rnp->blkd_tasks) { pr_cont(" %p", lhp); - if (++i >= 10) + if (++i >= ncheck) break; } pr_cont("\n"); @@ -1051,33 +909,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) static void rcu_preempt_deferred_qs(struct task_struct *t) { } /* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ - return 0; -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections that are - * blocking the current expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ - return 0; -} - -/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. @@ -1088,14 +919,10 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) } /* - * Check to see if this CPU is in a non-context-switch quiescent state - * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule RCU core processing. - * - * This function must be called from hardirq context. It is normally - * invoked from the scheduling-clock interrupt. + * Check to see if this CPU is in a non-context-switch quiescent state, + * namely user mode and idle loop. */ -static void rcu_flavor_check_callbacks(int user) +static void rcu_flavor_sched_clock_irq(int user) { if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -1115,22 +942,6 @@ static void rcu_flavor_check_callbacks(int user) } } -/* PREEMPT=n implementation of synchronize_rcu(). */ -void synchronize_rcu(void) -{ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || - lock_is_held(&rcu_lock_map) || - lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) - return; - if (rcu_gp_is_expedited()) - synchronize_rcu_expedited(); - else - wait_rcu_gp(call_rcu); -} -EXPORT_SYMBOL_GPL(synchronize_rcu); - /* * Because preemptible RCU does not exist, tasks cannot possibly exit * while in preemptible RCU read-side critical sections. @@ -1150,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ +/* + * If boosting, set rcuc kthreads to realtime priority.
+ */ +static void rcu_cpu_kthread_setup(unsigned int cpu) +{ #ifdef CONFIG_RCU_BOOST + struct sched_param sp; -static void rcu_wake_cond(struct task_struct *t, int status) -{ - /* - * If the thread is yielding, only wake it when this - * is invoked from idle - */ - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) - wake_up_process(t); + sp.sched_priority = kthread_prio; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); +#endif /* #ifdef CONFIG_RCU_BOOST */ } +#ifdef CONFIG_RCU_BOOST + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1276,8 +1090,6 @@ static int rcu_boost_kthread(void *arg) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - struct task_struct *t; - raw_lockdep_assert_held_rcu_node(rnp); if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1291,38 +1103,20 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - t = rnp->boost_kthread_task; - if (t) - rcu_wake_cond(t, rnp->boost_kthread_status); + rcu_wake_cond(rnp->boost_kthread_task, + rnp->boost_kthread_status); } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } /* - * Wake up the per-CPU kthread to invoke RCU callbacks. - */ -static void invoke_rcu_callbacks_kthread(void) -{ - unsigned long flags; - - local_irq_save(flags); - __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && - current != __this_cpu_read(rcu_cpu_kthread_task)) { - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), - __this_cpu_read(rcu_cpu_kthread_status)); - } - local_irq_restore(flags); -} - -/* * Is the current CPU running the RCU-callbacks kthread? * Caller must have preemption disabled. */ static bool rcu_is_callbacks_kthread(void) { - return __this_cpu_read(rcu_cpu_kthread_task) == current; + return __this_cpu_read(rcu_data.rcu_cpu_kthread_task) == current; } #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) @@ -1369,65 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) return 0; } -static void rcu_kthread_do_work(void) -{ - rcu_do_batch(this_cpu_ptr(&rcu_data)); -} - -static void rcu_cpu_kthread_setup(unsigned int cpu) -{ - struct sched_param sp; - - sp.sched_priority = kthread_prio; - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); -} - -static void rcu_cpu_kthread_park(unsigned int cpu) -{ - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; -} - -static int rcu_cpu_kthread_should_run(unsigned int cpu) -{ - return __this_cpu_read(rcu_cpu_has_work); -} - -/* - * Per-CPU kernel thread that invokes RCU callbacks. This replaces - * the RCU softirq used in configurations of RCU that do not support RCU - * priority boosting. 
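Both the rcu_cpu_kthread() code being removed here and its replacement registered from rcu_spawn_core_kthreads() in tree.c sit on the smpboot per-CPU kthread interface. A minimal hypothetical user of that interface looks roughly like this:

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/smpboot.h>

static DEFINE_PER_CPU(struct task_struct *, example_task);
static DEFINE_PER_CPU(unsigned long, example_work_pending);

static int example_should_run(unsigned int cpu)
{
	return this_cpu_read(example_work_pending);
}

static void example_thread_fn(unsigned int cpu)
{
	this_cpu_write(example_work_pending, 0);
	/* Process this CPU's work here, in "example/%u" kthread context. */
}

static struct smp_hotplug_thread example_thread_spec = {
	.store			= &example_task,
	.thread_should_run	= example_should_run,
	.thread_fn		= example_thread_fn,
	.thread_comm		= "example/%u",
};

static int __init example_spawn_kthreads(void)
{
	return smpboot_register_percpu_thread(&example_thread_spec);
}
early_initcall(example_spawn_kthreads);

The .setup and .park hooks used by the real rcu_cpu_thread_spec, which set SCHED_FIFO priority and record the OFFCPU status, are optional and omitted here.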
- */ -static void rcu_cpu_kthread(unsigned int cpu) -{ - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); - int spincnt; - - for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); - local_bh_disable(); - *statusp = RCU_KTHREAD_RUNNING; - this_cpu_inc(rcu_cpu_kthread_loops); - local_irq_disable(); - work = *workp; - *workp = 0; - local_irq_enable(); - if (work) - rcu_kthread_do_work(); - local_bh_enable(); - if (*workp == 0) { - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); - *statusp = RCU_KTHREAD_WAITING; - return; - } - } - *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); - schedule_timeout_interruptible(2); - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); - *statusp = RCU_KTHREAD_WAITING; -} - /* * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still @@ -1458,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) free_cpumask_var(cm); } -static struct smp_hotplug_thread rcu_cpu_thread_spec = { - .store = &rcu_cpu_kthread_task, - .thread_should_run = rcu_cpu_kthread_should_run, - .thread_fn = rcu_cpu_kthread, - .thread_comm = "rcuc/%u", - .setup = rcu_cpu_kthread_setup, - .park = rcu_cpu_kthread_park, -}; - /* * Spawn boost kthreads -- called as soon as the scheduler is running. */ static void __init rcu_spawn_boost_kthreads(void) { struct rcu_node *rnp; - int cpu; - for_each_possible_cpu(cpu) - per_cpu(rcu_cpu_has_work, cpu) = 0; - if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) - return; rcu_for_each_leaf_node(rnp) (void)rcu_spawn_one_boost_kthread(rnp); } @@ -1501,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } -static void invoke_rcu_callbacks_kthread(void) -{ - WARN_ON_ONCE(1); -} - static bool rcu_is_callbacks_kthread(void) { return false; @@ -1543,7 +1259,7 @@ static void rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(u64 basemono, u64 *nextevt) { *nextevt = KTIME_MAX; - return rcu_cpu_has_callbacks(NULL); + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist); } /* @@ -1562,14 +1278,6 @@ static void rcu_prepare_for_idle(void) { } -/* - * Don't bother keeping a running count of the number of RCU callbacks - * posted because CONFIG_RCU_FAST_NO_HZ=n. - */ -static void rcu_idle_count_callbacks_posted(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ /* @@ -1652,11 +1360,8 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) lockdep_assert_irqs_disabled(); - /* Snapshot to detect later posting of non-lazy callback. */ - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(&rdp->all_lazy)) { + if (rcu_segcblist_empty(&rdp->cblist)) { *nextevt = KTIME_MAX; return 0; } @@ -1670,11 +1375,12 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) rdp->last_accelerate = jiffies; /* Request timer delay depending on laziness, and round. 
*/ - if (!rdp->all_lazy) { + rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); + if (rdp->all_lazy) { + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + } else { dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - } else { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } *nextevt = basemono + dj * TICK_NSEC; return 0; @@ -1704,7 +1410,7 @@ static void rcu_prepare_for_idle(void) /* Handle nohz enablement switches conservatively. */ tne = READ_ONCE(tick_nohz_active); if (tne != rdp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(NULL)) + if (!rcu_segcblist_empty(&rdp->cblist)) invoke_rcu_core(); /* force nohz to see update. */ rdp->tick_nohz_enabled_snap = tne; return; @@ -1717,10 +1423,8 @@ static void rcu_prepare_for_idle(void) * callbacks, invoke RCU core for the side-effect of recalculating * idle duration on re-entry to idle. */ - if (rdp->all_lazy && - rdp->nonlazy_posted != rdp->nonlazy_posted_snap) { + if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { rdp->all_lazy = false; - rdp->nonlazy_posted_snap = rdp->nonlazy_posted; invoke_rcu_core(); return; } @@ -1756,142 +1460,49 @@ static void rcu_cleanup_after_idle(void) invoke_rcu_core(); } -/* - * Keep a running count of the number of non-lazy callbacks posted - * on this CPU. This running counter (which is never decremented) allows - * rcu_prepare_for_idle() to detect when something out of the idle loop - * posts a callback, even if an equal number of callbacks are invoked. - * Of course, callbacks should only be posted from within a trace event - * designed to be called from idle or from within RCU_NONIDLE(). - */ -static void rcu_idle_count_callbacks_posted(void) -{ - __this_cpu_add(rcu_data.nonlazy_posted, 1); -} - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - unsigned long nlpd = rdp->nonlazy_posted - rdp->nonlazy_posted_snap; - - sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ulong2long(nlpd), - rdp->all_lazy ? 'L' : '.', - rdp->tick_nohz_enabled_snap ? '.' : 'D'); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ - pr_cont("\n"); -} - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware. Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ - unsigned long delta; - char fast_no_hz[72]; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - char *ticks_title; - unsigned long ticks_value; - - /* - * We could be printing a lot while holding a spinlock. Avoid - * triggering hard lockup. 
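The laziness check in rcu_needs_cpu() above trades wakeup latency for batching: non-lazy callbacks get a short delay rounded up to a multiple of that delay, while lazy-only callbacks get a much longer delay rounded to a coarse boundary so that many CPUs wake together. A toy user-space model of that arithmetic; the delay constants are stand-ins in the spirit of RCU_IDLE_GP_DELAY and RCU_IDLE_LAZY_GP_DELAY, not authoritative values:

#include <stdio.h>

#define HZ			1000UL
#define IDLE_GP_DELAY		4UL		/* Short delay for non-lazy callbacks. */
#define IDLE_LAZY_GP_DELAY	(10UL * HZ)	/* Long delay when all callbacks are lazy. */

static unsigned long round_up_pow2(unsigned long x, unsigned long y)
{
	return (x + y - 1) & ~(y - 1);		/* y must be a power of two. */
}

static unsigned long round_to_second(unsigned long j)
{
	return ((j + HZ / 2) / HZ) * HZ;	/* Crude stand-in for round_jiffies(). */
}

int main(void)
{
	unsigned long jiffies = 123456;
	unsigned long dj_nonlazy = round_up_pow2(jiffies + IDLE_GP_DELAY,
						 IDLE_GP_DELAY) - jiffies;
	unsigned long dj_lazy = round_to_second(jiffies + IDLE_LAZY_GP_DELAY) - jiffies;

	printf("non-lazy wakeup in %lu jiffies, lazy-only wakeup in %lu jiffies\n",
	       dj_nonlazy, dj_lazy);
	return 0;
}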
- */ - touch_nmi_watchdog(); - - ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); - if (ticks_value) { - ticks_title = "GPs behind"; - } else { - ticks_title = "ticks this GP"; - ticks_value = rdp->ticks_this_gp; - } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", - cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], - "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], - !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : - rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : - "!."[!delta], - ticks_value, ticks_title, - rcu_dynticks_snap(rdp) & 0xfff, - rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, - rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), - READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz); -} - -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ - pr_err("\t"); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ - rdp->ticks_this_gp = 0; - rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); - WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} - #ifdef CONFIG_RCU_NOCB_CPU /* * Offload callback processing from the boot-time-specified set of CPUs - * specified by rcu_nocb_mask. For each CPU in the set, there is a - * kthread created that pulls the callbacks from the corresponding CPU, - * waits for a grace period to elapse, and invokes the callbacks. - * The no-CBs CPUs do a wake_up() on their kthread when they insert - * a callback into any empty list, unless the rcu_nocb_poll boot parameter - * has been specified, in which case each kthread actively polls its - * CPU. (Which isn't so great for energy efficiency, but which does - * reduce RCU's overhead on that CPU.) + * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads + * created that pull the callbacks from the corresponding CPU, wait for + * a grace period to elapse, and invoke the callbacks. These kthreads + * are organized into leaders, which manage incoming callbacks, wait for + * grace periods, and awaken followers, and the followers, which only + * invoke callbacks. Each leader is its own follower. The no-CBs CPUs + * do a wake_up() on their kthread when they insert a callback into any + * empty list, unless the rcu_nocb_poll boot parameter has been specified, + * in which case each kthread actively polls its CPU. (Which isn't so great + * for energy efficiency, but which does reduce RCU's overhead on that CPU.) * * This is intended to be used in conjunction with Frederic Weisbecker's * adaptive-idle work, which would seriously reduce OS jitter on CPUs * running CPU-bound user-mode computations. * - * Offloading of callback processing could also in theory be used as - * an energy-efficiency measure because CPUs with no RCU callbacks - * queued are more aggressive about entering dyntick-idle mode. + * Offloading of callbacks can also be used as an energy-efficiency + * measure because CPUs with no RCU callbacks queued are more aggressive + * about entering dyntick-idle mode. */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges. 
If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */ static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - cpulist_parse(str, rcu_nocb_mask); + if (!strcasecmp(str, "all")) + cpumask_setall(rcu_nocb_mask); + else + if (cpulist_parse(str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); @@ -1987,10 +1598,7 @@ static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } -/* - * Does the specified CPU need an RCU callback for this invocation - * of rcu_barrier()? - */ +/* Does rcu_barrier need to queue an RCU callback on the specified CPU? */ static bool rcu_nocb_cpu_needs_barrier(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); @@ -2006,8 +1614,8 @@ static bool rcu_nocb_cpu_needs_barrier(int cpu) * callbacks would be posted. In the worst case, the first * barrier in rcu_barrier() suffices (but the caller cannot * necessarily rely on this, not a substitute for the caller - * getting the concurrency design right!). There must also be - * a barrier between the following load an posting of a callback + * getting the concurrency design right!). There must also be a + * barrier between the following load and posting of a callback * (if a callback is in fact needed). This is associated with an * atomic_inc() in the caller. */ @@ -2517,9 +2125,9 @@ static void rcu_spawn_one_nocb_kthread(int cpu) /* * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthreads, spawn them. + * rcuo kthread, spawn it. */ -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { if (rcu_scheduler_fully_active) rcu_spawn_one_nocb_kthread(cpu); @@ -2536,7 +2144,7 @@ static void __init rcu_spawn_nocb_kthreads(void) int cpu; for_each_online_cpu(cpu) - rcu_spawn_all_nocb_kthreads(cpu); + rcu_spawn_cpu_nocb_kthread(cpu); } /* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ @@ -2670,7 +2278,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) { } -static void rcu_spawn_all_nocb_kthreads(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..065183391f75 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,711 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney <paulmck@linux.ibm.com> + */ + +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. + +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +/* Limit-check stall timeouts specified at boottime and runtime. */ +int rcu_jiffies_till_stall_check(void) +{ + int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+ */ + if (till_stall_check < 3) { + WRITE_ONCE(rcu_cpu_stall_timeout, 3); + till_stall_check = 3; + } else if (till_stall_check > 300) { + WRITE_ONCE(rcu_cpu_stall_timeout, 300); + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ +void rcu_sysrq_start(void) +{ + if (!rcu_cpu_stall_suppress) + rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ + if (rcu_cpu_stall_suppress == 2) + rcu_cpu_stall_suppress = 0; +} + +/* Don't print RCU CPU stall warnings during a kernel panic. */ +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); + return 0; +} +early_initcall(check_cpu_stall_init); + +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). */ +static void record_gp_stall_check_time(void) +{ + unsigned long j = jiffies; + unsigned long j1; + + rcu_state.gp_start = j; + j1 = rcu_jiffies_till_stall_check(); + /* Record ->gp_start before ->jiffies_stall. */ + smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ + rcu_state.jiffies_resched = j + j1 / 2; + rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); + WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ + unsigned long j; + + if (!rcu_kick_kthreads) + return; + j = READ_ONCE(rcu_state.jiffies_kick_kthreads); + if (time_after(jiffies, j) && rcu_state.gp_kthread && + (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { + WARN_ONCE(1, "Kicking %s grace-period kthread\n", + rcu_state.name); + rcu_ftrace_dump(DUMP_ALL); + wake_up_process(rcu_state.gp_kthread); + WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); + } +} + +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling. Set state + * appropriately, but just complain if there is unexpected state on entry. 
+ */ +static void rcu_iw_handler(struct irq_work *iwp) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = container_of(iwp, struct rcu_data, rcu_iw); + rnp = rdp->mynode; + raw_spin_lock_rcu_node(rnp); + if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { + rdp->rcu_iw_gp_seq = rnp->gp_seq; + rdp->rcu_iw_pending = false; + } + raw_spin_unlock_rcu_node(rnp); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + +#ifdef CONFIG_PREEMPT + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + /* + * We could be printing a lot while holding a spinlock. + * Avoid triggering hard lockup. + */ + touch_nmi_watchdog(); + sched_show_task(t); + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return 0; + pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + pr_cont("\n"); + return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ + return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT */ + +/* + * Dump stacks of all tasks running on stalled CPUs. First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps. The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. 
+ */ +static void rcu_dump_cpu_stacks(void) +{ + int cpu; + unsigned long flags; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + + sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + rdp->last_accelerate & 0xffff, jiffies & 0xffff, + ".l"[rdp->all_lazy], + ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], + ".D"[!!rdp->tick_nohz_enabled_snap]); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + *cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware. Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(int cpu) +{ + unsigned long delta; + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + char *ticks_title; + unsigned long ticks_value; + + /* + * We could be printing a lot while holding a spinlock. Avoid + * triggering hard lockup. + */ + touch_nmi_watchdog(); + + ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); + if (ticks_value) { + ticks_title = "GPs behind"; + } else { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : + rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : + "!."[!delta], + ticks_value, ticks_title, + rcu_dynticks_snap(rdp) & 0xfff, + rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), + READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, + fast_no_hz); +} + +/* Complain about starvation of grace-period kthread. */ +static void rcu_check_gp_kthread_starvation(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + unsigned long j; + + j = jiffies - READ_ONCE(rcu_state.gp_activity); + if (j > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", + rcu_state.name, j, + (long)rcu_seq_current(&rcu_state.gp_seq), + READ_ONCE(rcu_state.gp_flags), + gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, + gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); + if (gpk) { + pr_err("RCU grace-period kthread stack dump:\n"); + sched_show_task(gpk); + wake_up_process(gpk); + } + } +} + +static void print_other_cpu_stall(unsigned long gp_seq) +{ + int cpu; + unsigned long flags; + unsigned long gpa; + unsigned long j; + int ndetected = 0; + struct rcu_node *rnp; + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); + rcu_for_each_leaf_node(rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp); + if (rnp->qsmask != 0) { + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(cpu); + ndetected++; + } + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", + smp_processor_id(), (long)(jiffies - rcu_state.gp_start), + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + if (ndetected) { + rcu_dump_cpu_stacks(); + + /* Complain about tasks blocking the grace period. */ + rcu_for_each_leaf_node(rnp) + rcu_print_detail_task_stall_rnp(rnp); + } else { + if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { + pr_err("INFO: Stall ended before state dump start\n"); + } else { + j = jiffies; + gpa = READ_ONCE(rcu_state.gp_activity); + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", + rcu_state.name, j - gpa, j, gpa, + READ_ONCE(jiffies_till_next_fqs), + rcu_get_root()->qsmask); + /* In this case, the current CPU might be at fault. */ + sched_show_task(current); + } + } + /* Rewrite if needed in case of slow consoles. */ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + + rcu_check_gp_kthread_starvation(); + + panic_on_rcu_stall(); + + rcu_force_quiescent_state(); /* Kick them all. */ +} + +static void print_cpu_stall(void) +{ + int cpu; + unsigned long flags; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rcu_get_root(); + long totqlen = 0; + + /* Kick and suppress, if so configured. */ + rcu_stall_kick_kthreads(); + if (rcu_cpu_stall_suppress) + return; + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); + raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); + print_cpu_stall_info(smp_processor_id()); + raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); + for_each_possible_cpu(cpu) + totqlen += rcu_get_n_cbs_cpu(cpu); + pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", + jiffies - rcu_state.gp_start, + (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + + rcu_check_gp_kthread_starvation(); + + rcu_dump_cpu_stacks(); + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + /* Rewrite if needed in case of slow consoles. 
*/ + if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + panic_on_rcu_stall(); + + /* + * Attempt to revive the RCU machinery by forcing a context switch. + * + * A context switch would normally allow the RCU state machine to make + * progress and it could be we're stuck in kernel space without context + * switches for an entirely unreasonable amount of time. + */ + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + +static void check_cpu_stall(struct rcu_data *rdp) +{ + unsigned long gs1; + unsigned long gs2; + unsigned long gps; + unsigned long j; + unsigned long jn; + unsigned long js; + struct rcu_node *rnp; + + if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || + !rcu_gp_in_progress()) + return; + rcu_stall_kick_kthreads(); + j = jiffies; + + /* + * Lots of memory barriers to reject false positives. + * + * The idea is to pick up rcu_state.gp_seq, then + * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally + * another copy of rcu_state.gp_seq. These values are updated in + * the opposite order with memory barriers (or equivalent) during + * grace-period initialization and cleanup. Now, a false positive + * can occur if we get an new value of rcu_state.gp_start and a old + * value of rcu_state.jiffies_stall. But given the memory barriers, + * the only way that this can happen is if one grace period ends + * and another starts between these two fetches. This is detected + * by comparing the second fetch of rcu_state.gp_seq with the + * previous fetch from rcu_state.gp_seq. + * + * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, + * and rcu_state.gp_start suffice to forestall false positives. + */ + gs1 = READ_ONCE(rcu_state.gp_seq); + smp_rmb(); /* Pick up ->gp_seq first... */ + js = READ_ONCE(rcu_state.jiffies_stall); + smp_rmb(); /* ...then ->jiffies_stall before the rest... */ + gps = READ_ONCE(rcu_state.gp_start); + smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ + gs2 = READ_ONCE(rcu_state.gp_seq); + if (gs1 != gs2 || + ULONG_CMP_LT(j, js) || + ULONG_CMP_GE(gps, js)) + return; /* No stall or GP completed since entering function. */ + rnp = rdp->mynode; + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + if (rcu_gp_in_progress() && + (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(); + + } else if (rcu_gp_in_progress() && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && + cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2); + } +} + +////////////////////////////////////////////////////////////////////////////// +// +// RCU forward-progress mechanisms, including of callback invocation. + + +/* + * Show the state of the grace-period kthreads. 
+ */ +void show_rcu_gp_kthreads(void) +{ + int cpu; + unsigned long j; + unsigned long ja; + unsigned long jr; + unsigned long jw; + struct rcu_data *rdp; + struct rcu_node *rnp; + + j = jiffies; + ja = j - READ_ONCE(rcu_state.gp_activity); + jr = j - READ_ONCE(rcu_state.gp_req_activity); + jw = j - READ_ONCE(rcu_state.gp_wake_time); + pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", + rcu_state.name, gp_state_getname(rcu_state.gp_state), + rcu_state.gp_state, + rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, + ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), + (long)READ_ONCE(rcu_state.gp_seq), + (long)READ_ONCE(rcu_get_root()->gp_seq_needed), + READ_ONCE(rcu_state.gp_flags)); + rcu_for_each_node_breadth_first(rnp) { + if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) + continue; + pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", + rnp->grplo, rnp->grphi, (long)rnp->gp_seq, + (long)rnp->gp_seq_needed); + if (!rcu_is_leaf_node(rnp)) + continue; + for_each_leaf_node_possible_cpu(rnp, cpu) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->gpwrap || + ULONG_CMP_GE(rcu_state.gp_seq, + rdp->gp_seq_needed)) + continue; + pr_info("\tcpu %d ->gp_seq_needed %ld\n", + cpu, (long)rdp->gp_seq_needed); + } + } + /* sched_show_task(rcu_state.gp_kthread); */ +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, + const unsigned long gpssdelay) +{ + unsigned long flags; + unsigned long j; + struct rcu_node *rnp_root = rcu_get_root(); + static atomic_t warned = ATOMIC_INIT(0); + + if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) + return; + j = jiffies; /* Expensive access, and in common case don't get here. */ + if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) + return; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || + time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || + atomic_read(&warned)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + /* Hold onto the leaf lock to make others see warned==1. */ + + if (rnp_root != rnp) + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ + j = jiffies; + if (rcu_gp_in_progress() || + ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || + time_before(j, rcu_state.gp_req_activity + gpssdelay) || + time_before(j, rcu_state.gp_activity + gpssdelay) || + atomic_xchg(&warned, 1)) { + if (rnp_root != rnp) + /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + WARN_ON(1); + if (rnp_root != rnp) + raw_spin_unlock_rcu_node(rnp_root); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + show_rcu_gp_kthreads(); +} + +/* + * Do a forward-progress check for rcutorture. This is normally invoked + * due to an OOM event. The argument "j" gives the time period during + * which rcutorture would like progress to have been made. 
+ */ +void rcu_fwd_progress_check(unsigned long j) +{ + unsigned long cbs; + int cpu; + unsigned long max_cbs = 0; + int max_cpu = -1; + struct rcu_data *rdp; + + if (rcu_gp_in_progress()) { + pr_info("%s: GP age %lu jiffies\n", + __func__, jiffies - rcu_state.gp_start); + show_rcu_gp_kthreads(); + } else { + pr_info("%s: Last GP end %lu jiffies ago\n", + __func__, jiffies - rcu_state.gp_end); + preempt_disable(); + rdp = this_cpu_ptr(&rcu_data); + rcu_check_gp_start_stall(rdp->mynode, rdp, j); + preempt_enable(); + } + for_each_possible_cpu(cpu) { + cbs = rcu_get_n_cbs_cpu(cpu); + if (!cbs) + continue; + if (max_cpu < 0) + pr_info("%s: callbacks", __func__); + pr_cont(" %d: %lu", cpu, cbs); + if (cbs <= max_cbs) + continue; + max_cbs = cbs; + max_cpu = cpu; + } + if (max_cpu >= 0) + pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); + +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ + show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { + .handler = sysrq_show_rcu, + .help_msg = "show-rcu(y)", + .action_msg = "Show RCU tree", + .enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ + if (sysrq_rcu) + return register_sysrq_key('y', &sysrq_rcudump_op); + return 0; +} +early_initcall(rcu_sysrq_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1971869c4072..61df2bf08563 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -1,26 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright IBM Corporation, 2001 * * Authors: Dipankar Sarma <dipankar@in.ibm.com> * Manfred Spraul <manfred@colorfullife.com> * - * Based on the original work by Paul McKenney <paulmck@us.ibm.com> + * Based on the original work by Paul McKenney <paulmck@linux.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf @@ -52,6 +39,7 @@ #include <linux/tick.h> #include <linux/rcupdate_wait.h> #include <linux/sched/isolation.h> +#include <linux/kprobes.h> #define CREATE_TRACE_POINTS @@ -249,6 +237,7 @@ int notrace debug_lockdep_rcu_enabled(void) current->lockdep_recursion == 0; } EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); +NOKPROBE_SYMBOL(debug_lockdep_rcu_enabled); /** * rcu_read_lock_held() - might we be in RCU read-side critical section? @@ -434,69 +423,25 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); do { } while (0) #endif -#ifdef CONFIG_RCU_STALL_COMMON +#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) +/* Get rcutorture access to sched_setaffinity(). 
*/ +long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) +{ + int ret; -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA 0 + ret = sched_setaffinity(pid, in_mask); + WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); + return ret; +} +EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); #endif +#ifdef CONFIG_RCU_STALL_COMMON int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; - module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ - int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - - /* - * Limit check must be consistent with the Kconfig limits - * for CONFIG_RCU_CPU_STALL_TIMEOUT. - */ - if (till_stall_check < 3) { - WRITE_ONCE(rcu_cpu_stall_timeout, 3); - till_stall_check = 3; - } else if (till_stall_check > 300) { - WRITE_ONCE(rcu_cpu_stall_timeout, 300); - till_stall_check = 300; - } - return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ - if (!rcu_cpu_stall_suppress) - rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ - if (rcu_cpu_stall_suppress == 2) - rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ - rcu_cpu_stall_suppress = 1; - return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { - .notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); - return 0; -} -early_initcall(check_cpu_stall_init); - #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_TASKS_RCU diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..c4d472b7f1b4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/reboot.c * @@ -31,6 +32,7 @@ EXPORT_SYMBOL(cad_pid); #define DEFAULT_REBOOT_MODE #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; +enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; /* * This variable is used privately to keep track of whether or not @@ -519,6 +521,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); static int __init reboot_setup(char *str) { for (;;) { + enum reboot_mode *mode; + /* * Having anything passed on the command line via * reboot= will cause us to disable DMI checking @@ -526,17 +530,24 @@ static int __init reboot_setup(char *str) */ reboot_default = 0; + if (!strncmp(str, "panic_", 6)) { + mode = &panic_reboot_mode; + str += 6; + } else { + mode = &reboot_mode; + } + switch (*str) { case 'w': - reboot_mode = REBOOT_WARM; + *mode = REBOOT_WARM; break; case 'c': - reboot_mode = REBOOT_COLD; + *mode = REBOOT_COLD; break; case 'h': - reboot_mode = REBOOT_HARD; + *mode = REBOOT_HARD; break; case 's': @@ -553,11 +564,11 @@ static int __init reboot_setup(char *str) if (rc) return rc; } else - reboot_mode = REBOOT_SOFT; + *mode = REBOOT_SOFT; break; } case 'g': - reboot_mode = REBOOT_GPIO; + *mode = REBOOT_GPIO; break; case 'b': diff --git a/kernel/relay.c b/kernel/relay.c index 9e0f52375487..ade14fb7ce2e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1177,7 +1177,6 @@ static void 
relay_pipe_buf_release(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations relay_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/resource.c b/kernel/resource.c index 915c02e8e5dd..d22423e85cf8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/resource.c * @@ -382,7 +383,7 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, int (*func)(struct resource *, void *)) { struct resource res; - int ret = -1; + int ret = -EINVAL; while (start < end && !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) { @@ -448,12 +449,13 @@ int walk_mem_res(u64 start, u64 end, void *arg, arg, func); } -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) - /* * This function calls the @func callback against all memory ranges of type * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. * It is to be used only for System RAM. + * + * This will find System RAM ranges that are children of top-level resources + * in addition to top-level System RAM resources. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -462,14 +464,14 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long flags; struct resource res; unsigned long pfn, end_pfn; - int ret = -1; + int ret = -EINVAL; start = (u64) start_pfn << PAGE_SHIFT; end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; while (start < end && !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, - true, &res)) { + false, &res)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -481,8 +483,6 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, return ret; } -#endif - static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) { return 1; @@ -521,21 +521,20 @@ EXPORT_SYMBOL_GPL(page_is_ram); int region_intersects(resource_size_t start, size_t size, unsigned long flags, unsigned long desc) { - resource_size_t end = start + size - 1; + struct resource res; int type = 0; int other = 0; struct resource *p; + res.start = start; + res.end = start + size - 1; + read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { bool is_type = (((p->flags & flags) == flags) && ((desc == IORES_DESC_NONE) || (desc == p->desc))); - if (start >= p->start && start <= p->end) - is_type ? type++ : other++; - if (end >= p->start && end <= p->end) - is_type ? type++ : other++; - if (p->start >= start && p->end <= end) + if (resource_overlaps(p, &res)) is_type ? type++ : other++; } read_unlock(&resource_lock); @@ -1132,6 +1131,15 @@ struct resource * __request_region(struct resource *parent, conflict = __request_resource(parent, res); if (!conflict) break; + /* + * mm/hmm.c reserves physical addresses which then + * become unavailable to other users. Conflicts are + * not expected. Warn to aid debugging if encountered. 
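These IORES_DESC_DEVICE_PRIVATE_MEMORY ranges are carved out with the devm_request_free_mem_region() helper added a little further down in this resource.c diff. A hedged sketch of a caller, with a hypothetical driver function name and the helper's signature taken from the hunk below:

/* Sketch only: a hypothetical driver reserving device-private memory. */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/ioport.h>

static int example_reserve_private_mem(struct device *dev, unsigned long size)
{
	struct resource *res;

	/*
	 * Find a free physical range under iomem_resource; the helper marks
	 * it IORES_DESC_DEVICE_PRIVATE_MEMORY, which is what triggers the
	 * warning above if someone else later tries to claim it.
	 */
	res = devm_request_free_mem_region(dev, &iomem_resource, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	dev_info(dev, "reserved %pR for device-private pages\n", res);
	/* The range would then be handed to the ZONE_DEVICE/HMM setup. */
	return 0;
}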
+ */ + if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR", + conflict->name, conflict, res); + } if (conflict != parent) { if (!(conflict->flags & IORESOURCE_BUSY)) { parent = conflict; @@ -1620,6 +1628,45 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); +#ifdef CONFIG_DEVICE_PRIVATE +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource to + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + resource_size_t end, addr; + struct resource *res; + + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); + addr = end - size + 1UL; + + for (; addr > size && addr >= base->start; addr -= size) { + if (region_intersects(addr, size, 0, IORES_DESC_NONE) != + REGION_DISJOINT) + continue; + + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (!res) + return ERR_PTR(-ENOMEM); + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + return res; + } + + return ERR_PTR(-ERANGE); +} +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); +#endif /* CONFIG_DEVICE_PRIVATE */ + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..27c48eb7de40 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs) * - signal delivery, * and return to user-space. * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. @@ -278,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) error: sig = ksig ? ksig->sig : 0; - force_sigsegv(sig, t); + force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ @@ -297,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs) return; if (!access_ok(t->rseq, sizeof(*t->rseq)) || rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV, t); + force_sig(SIGSEGV); } #endif @@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (current->rseq_len != rseq_len) + if (rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (ret) return ret; current->rseq = NULL; - current->rseq_len = 0; current->rseq_sig = 0; return 0; } @@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. 
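From user space, the rseq_len argument below is now validated directly against sizeof(struct rseq) rather than against a per-task stored length, so both registration and unregistration must pass exactly that size. A hedged sketch using the raw syscall (glibc has no wrapper; RSEQ_SIG is just an example signature, and the register call may fail with EBUSY if the C library has already registered rseq for this thread):

#define _GNU_SOURCE
#include <linux/rseq.h>		/* struct rseq, RSEQ_FLAG_UNREGISTER */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SIG 0x53053053	/* example signature; must match on unregister */

static __thread struct rseq rseq_area __attribute__((aligned(32)));

int main(void)
{
	/* Register: any length other than sizeof(struct rseq) is -EINVAL. */
	if (syscall(SYS_rseq, &rseq_area, sizeof(rseq_area), 0, RSEQ_SIG))
		perror("rseq register");

	/* Unregister with the same address, length and signature. */
	if (syscall(SYS_rseq, &rseq_area, sizeof(rseq_area),
		    RSEQ_FLAG_UNREGISTER, RSEQ_SIG))
		perror("rseq unregister");
	return 0;
}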
*/ - if (current->rseq != rseq || current->rseq_len != rseq_len) + if (current->rseq != rseq || rseq_len != sizeof(*rseq)) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; - current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2d4ff5353ded..2067080bb235 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -259,7 +259,6 @@ out: } #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { if (!task_group_is_autogroup(tg)) @@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e3e3b979f9bd..1152259a4ca0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sched_clock() for unstable CPU clocks * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8d76a65cfdd..2b037f195473 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/core.c * @@ -22,6 +23,17 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> +/* + * Export tracepoints that act as a bare tracehook (ie: have no trace event + * associated with them) to allow external modules to probe them. + */ +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); + DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) @@ -107,11 +119,12 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * [L] ->on_rq * RELEASE (rq->lock) * - * If we observe the old CPU in task_rq_lock, the acquire of + * If we observe the old CPU in task_rq_lock(), the acquire of * the old rq->lock will fully serialize against the stores. * - * If we observe the new CPU in task_rq_lock, the acquire will - * pair with the WMB to ensure we must then also see migrating. + * If we observe the new CPU in task_rq_lock(), the address + * dependency headed by '[L] rq = task_rq()' and the acquire + * will pair with the WMB to ensure we then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { rq_pin_lock(rq, rf); @@ -180,6 +193,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) update_irq_load_avg(rq, irq_delta + steal); #endif + update_rq_clock_pelt(rq, delta); } void update_rq_clock(struct rq *rq) @@ -396,19 +410,7 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif -/** - * wake_q_add() - queue a wakeup for 'later' waking. - * @head: the wake_q_head to add @task to - * @task: the task to queue for 'later' wakeup - * - * Queue a task for later wakeup, most likely by the wake_up_q() call in the - * same context, _HOWEVER_ this is not guaranteed, the wakeup can come - * instantly. 
- * - * This function must be used as-if it were wake_up_process(); IOW the task - * must be ready to be woken at this location. - */ -void wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { struct wake_q_node *node = &task->wake_q; @@ -421,16 +423,56 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * state, even in the failed case, an explicit smp_mb() must be used. */ smp_mb__before_atomic(); - if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); + if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) + return false; /* * The head is context local, there can be no concurrency. */ *head->lastp = node; head->lastp = &node->next; + return true; +} + +/** + * wake_q_add() - queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + */ +void wake_q_add(struct wake_q_head *head, struct task_struct *task) +{ + if (__wake_q_add(head, task)) + get_task_struct(task); +} + +/** + * wake_q_add_safe() - safely queue a wakeup for 'later' waking. + * @head: the wake_q_head to add @task to + * @task: the task to queue for 'later' wakeup + * + * Queue a task for later wakeup, most likely by the wake_up_q() call in the + * same context, _HOWEVER_ this is not guaranteed, the wakeup can come + * instantly. + * + * This function must be used as-if it were wake_up_process(); IOW the task + * must be ready to be woken at this location. + * + * This function is essentially a task-safe equivalent to wake_q_add(). Callers + * that already hold reference to @task can call the 'safe' version and trust + * wake_q to do the right thing depending whether or not the @task is already + * queued for wakeup. 
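A hedged kernel-style sketch of the intended pattern (the queue and waiter structures are hypothetical): a caller that already holds a task reference hands it to wake_q_add_safe(), which either queues the task, in which case wake_up_q() drops the reference after the wakeup, or drops it immediately when the task was already queued by someone else.

/* Sketch only: hypothetical wait list drained through a wake_q. */
#include <linux/list.h>
#include <linux/sched/task.h>	/* get_task_struct() */
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct ex_waiter {
	struct list_head node;
	struct task_struct *task;
};

struct ex_queue {
	spinlock_t lock;
	struct list_head waiters;
};

static void ex_wake_all(struct ex_queue *q)
{
	DEFINE_WAKE_Q(wake_q);
	struct ex_waiter *w, *tmp;

	spin_lock(&q->lock);
	list_for_each_entry_safe(w, tmp, &q->waiters, node) {
		list_del_init(&w->node);
		get_task_struct(w->task);
		/*
		 * Our reference is consumed either way: by wake_up_q()
		 * after the wakeup, or right here if the task is already
		 * on someone else's wake_q.
		 */
		wake_q_add_safe(&wake_q, w->task);
	}
	spin_unlock(&q->lock);

	/* Do the actual wakeups outside the lock. */
	wake_up_q(&wake_q);
}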
+ */ +void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) +{ + if (!__wake_q_add(head, task)) + put_task_struct(task); } void wake_up_q(struct wake_q_head *head) @@ -730,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load) } } +#ifdef CONFIG_UCLAMP_TASK +/* Max allowed minimum utilization */ +unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; + +/* Max allowed maximum utilization */ +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; + +/* All clamps are required to be less or equal than these values */ +static struct uclamp_se uclamp_default[UCLAMP_CNT]; + +/* Integer rounded range for each bucket */ +#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) + +#define for_each_clamp_id(clamp_id) \ + for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) + +static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) +{ + return clamp_value / UCLAMP_BUCKET_DELTA; +} + +static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) +{ + return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); +} + +static inline unsigned int uclamp_none(int clamp_id) +{ + if (clamp_id == UCLAMP_MIN) + return 0; + return SCHED_CAPACITY_SCALE; +} + +static inline void uclamp_se_set(struct uclamp_se *uc_se, + unsigned int value, bool user_defined) +{ + uc_se->value = value; + uc_se->bucket_id = uclamp_bucket_id(value); + uc_se->user_defined = user_defined; +} + +static inline unsigned int +uclamp_idle_value(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + /* + * Avoid blocked utilization pushing up the frequency when we go + * idle (which drops the max-clamp) by retaining the last known + * max-clamp. + */ + if (clamp_id == UCLAMP_MAX) { + rq->uclamp_flags |= UCLAMP_FLAG_IDLE; + return clamp_value; + } + + return uclamp_none(UCLAMP_MIN); +} + +static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + /* Reset max-clamp retention only on idle exit */ + if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) + return; + + WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); +} + +static inline +unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, + unsigned int clamp_value) +{ + struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; + int bucket_id = UCLAMP_BUCKETS - 1; + + /* + * Since both min and max clamps are max aggregated, find the + * top most bucket with tasks in. 
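To make the bucket arithmetic concrete, a user-space sketch of the mapping only, assuming the default CONFIG_UCLAMP_BUCKETS_COUNT of 5, so UCLAMP_BUCKET_DELTA = DIV_ROUND_CLOSEST(1024, 5) = 205: clamp values 0..204 land in bucket 0, 512 lands in bucket 2, and 1024 lands in bucket 4, the last one.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define UCLAMP_BUCKETS		5	/* assumed default CONFIG_UCLAMP_BUCKETS_COUNT */
/* DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) for positive values */
#define UCLAMP_BUCKET_DELTA	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

static unsigned int bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	unsigned int v[] = { 0, 204, 205, 512, 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		printf("clamp %4u -> bucket %u (base value %u)\n", v[i],
		       bucket_id(v[i]), bucket_id(v[i]) * UCLAMP_BUCKET_DELTA);
	return 0;
}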
+ */ + for ( ; bucket_id >= 0; bucket_id--) { + if (!bucket[bucket_id].tasks) + continue; + return bucket[bucket_id].value; + } + + /* No tasks -- default clamp values */ + return uclamp_idle_value(rq, clamp_id, clamp_value); +} + +/* + * The effective clamp bucket index of a task depends on, by increasing + * priority: + * - the task specific clamp value, when explicitly requested from userspace + * - the system default clamp value, defined by the sysadmin + */ +static inline struct uclamp_se +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_req = p->uclamp_req[clamp_id]; + struct uclamp_se uc_max = uclamp_default[clamp_id]; + + /* System default restrictions always apply */ + if (unlikely(uc_req.value > uc_max.value)) + return uc_max; + + return uc_req; +} + +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) +{ + struct uclamp_se uc_eff; + + /* Task currently refcounted: use back-annotated (effective) value */ + if (p->uclamp[clamp_id].active) + return p->uclamp[clamp_id].value; + + uc_eff = uclamp_eff_get(p, clamp_id); + + return uc_eff.value; +} + +/* + * When a task is enqueued on a rq, the clamp bucket currently defined by the + * task's uclamp::bucket_id is refcounted on that rq. This also immediately + * updates the rq's clamp value if required. + * + * Tasks can have a task-specific value requested from user-space, track + * within each bucket the maximum value for tasks refcounted in it. + * This "local max aggregation" allows to track the exact "requested" value + * for each bucket when all its RUNNABLE tasks require the same clamp. + */ +static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + + lockdep_assert_held(&rq->lock); + + /* Update task effective clamp */ + p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + bucket->tasks++; + uc_se->active = true; + + uclamp_idle_reset(rq, clamp_id, uc_se->value); + + /* + * Local max aggregation: rq buckets always track the max + * "requested" clamp value of its RUNNABLE tasks. + */ + if (bucket->tasks == 1 || uc_se->value > bucket->value) + bucket->value = uc_se->value; + + if (uc_se->value > READ_ONCE(uc_rq->value)) + WRITE_ONCE(uc_rq->value, uc_se->value); +} + +/* + * When a task is dequeued from a rq, the clamp bucket refcounted by the task + * is released. If this is the last task reference counting the rq's max + * active clamp value, then the rq's clamp value is updated. + * + * Both refcounted tasks and rq's cached clamp values are expected to be + * always valid. If it's detected they are not, as defensive programming, + * enforce the expected state and warn. + */ +static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, + unsigned int clamp_id) +{ + struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; + struct uclamp_se *uc_se = &p->uclamp[clamp_id]; + struct uclamp_bucket *bucket; + unsigned int bkt_clamp; + unsigned int rq_clamp; + + lockdep_assert_held(&rq->lock); + + bucket = &uc_rq->bucket[uc_se->bucket_id]; + SCHED_WARN_ON(!bucket->tasks); + if (likely(bucket->tasks)) + bucket->tasks--; + uc_se->active = false; + + /* + * Keep "local max aggregation" simple and accept to (possibly) + * overboost some RUNNABLE tasks in the same bucket. 
+ * The rq clamp bucket value is reset to its base value whenever + * there are no more RUNNABLE tasks refcounting it. + */ + if (likely(bucket->tasks)) + return; + + rq_clamp = READ_ONCE(uc_rq->value); + /* + * Defensive programming: this should never happen. If it happens, + * e.g. due to future modification, warn and fixup the expected value. + */ + SCHED_WARN_ON(bucket->value > rq_clamp); + if (bucket->value >= rq_clamp) { + bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); + WRITE_ONCE(uc_rq->value, bkt_clamp); + } +} + +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); + + /* Reset clamp idle holding when there is one RUNNABLE task */ + if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) + rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; +} + +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) +{ + unsigned int clamp_id; + + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); +} + +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int old_min, old_max; + static DEFINE_MUTEX(mutex); + int result; + + mutex_lock(&mutex); + old_min = sysctl_sched_uclamp_util_min; + old_max = sysctl_sched_uclamp_util_max; + + result = proc_dointvec(table, write, buffer, lenp, ppos); + if (result) + goto undo; + if (!write) + goto done; + + if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + result = -EINVAL; + goto undo; + } + + if (old_min != sysctl_sched_uclamp_util_min) { + uclamp_se_set(&uclamp_default[UCLAMP_MIN], + sysctl_sched_uclamp_util_min, false); + } + if (old_max != sysctl_sched_uclamp_util_max) { + uclamp_se_set(&uclamp_default[UCLAMP_MAX], + sysctl_sched_uclamp_util_max, false); + } + + /* + * Updating all the RUNNABLE task is expensive, keep it simple and do + * just a lazy update at each next enqueue time. + */ + goto done; + +undo: + sysctl_sched_uclamp_util_min = old_min; + sysctl_sched_uclamp_util_max = old_max; +done: + mutex_unlock(&mutex); + + return result; +} + +static int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; + unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) + lower_bound = attr->sched_util_min; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) + upper_bound = attr->sched_util_max; + + if (lower_bound > upper_bound) + return -EINVAL; + if (upper_bound > SCHED_CAPACITY_SCALE) + return -EINVAL; + + return 0; +} + +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) +{ + unsigned int clamp_id; + + /* + * On scheduling class change, reset to default clamps for tasks + * without a task-specific value. 
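For completeness, this is how a task acquires a user-defined clamp in the first place: sched_setattr() with the new SCHED_FLAG_UTIL_CLAMP_MIN/MAX flags. A hedged user-space sketch (glibc has no wrapper, so the raw syscall is used; the flag values and struct layout below follow the UAPI additions of this series and should be double-checked against include/uapi/linux/sched.h):

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* UAPI flag values introduced alongside this code (illustrative). */
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* added with SCHED_ATTR_SIZE_VER1 */
	uint32_t sched_util_max;	/* added with SCHED_ATTR_SIZE_VER1 */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);	/* SCHED_ATTR_SIZE_VER1 */
	attr.sched_policy = 0;		/* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 128;	/* request at least ~12% of CPU capacity */
	attr.sched_util_max = 512;	/* cap the boost at half of CPU capacity */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))	/* pid 0 == current task */
		perror("sched_setattr");
	return 0;
}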
+ */ + for_each_clamp_id(clamp_id) { + struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int clamp_value = uclamp_none(clamp_id); + + /* Keep using defined clamps across class changes */ + if (uc_se->user_defined) + continue; + + /* By default, RT tasks always get 100% boost */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + clamp_value = uclamp_none(UCLAMP_MAX); + + uclamp_se_set(uc_se, clamp_value, false); + } + + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) + return; + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], + attr->sched_util_min, true); + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], + attr->sched_util_max, true); + } +} + +static void uclamp_fork(struct task_struct *p) +{ + unsigned int clamp_id; + + for_each_clamp_id(clamp_id) + p->uclamp[clamp_id].active = false; + + if (likely(!p->sched_reset_on_fork)) + return; + + for_each_clamp_id(clamp_id) { + unsigned int clamp_value = uclamp_none(clamp_id); + + /* By default, RT tasks always get 100% boost */ + if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) + clamp_value = uclamp_none(UCLAMP_MAX); + + uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); + } +} + +static void __init init_uclamp(void) +{ + struct uclamp_se uc_max = {}; + unsigned int clamp_id; + int cpu; + + for_each_possible_cpu(cpu) { + memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); + cpu_rq(cpu)->uclamp_flags = 0; + } + + for_each_clamp_id(clamp_id) { + uclamp_se_set(&init_task.uclamp_req[clamp_id], + uclamp_none(clamp_id), false); + } + + /* System defaults allow max clamp values for both indexes */ + uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); + for_each_clamp_id(clamp_id) + uclamp_default[clamp_id] = uc_max; +} + +#else /* CONFIG_UCLAMP_TASK */ +static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } +static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } +static inline int uclamp_validate(struct task_struct *p, + const struct sched_attr *attr) +{ + return -EOPNOTSUPP; +} +static void __setscheduler_uclamp(struct task_struct *p, + const struct sched_attr *attr) { } +static inline void uclamp_fork(struct task_struct *p) { } +static inline void init_uclamp(void) { } +#endif /* CONFIG_UCLAMP_TASK */ + static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { if (!(flags & ENQUEUE_NOCLOCK)) @@ -740,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) psi_enqueue(p, flags & ENQUEUE_WAKEUP); } + uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); } @@ -753,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) psi_dequeue(p, flags & DEQUEUE_SLEEP); } + uclamp_rq_dec(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -762,10 +1201,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); + + p->on_rq = TASK_ON_RQ_QUEUED; } void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { + p->on_rq = (flags & DEQUEUE_SLEEP) ? 
0 : TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -890,12 +1333,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) } /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; if (is_per_cpu_kthread(p)) @@ -928,7 +1371,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, { lockdep_assert_held(&rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; + WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); @@ -990,7 +1433,7 @@ static int migration_cpu_stop(void *data) local_irq_disable(); /* * We need to explicitly wake pending tasks before running - * __migrate_task() such that we will not miss enforcing cpus_allowed + * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ sched_ttwu_pending(); @@ -1021,7 +1464,7 @@ static int migration_cpu_stop(void *data) */ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { - cpumask_copy(&p->cpus_allowed, new_mask); + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } @@ -1091,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(&p->cpus_allowed, new_mask)) + if (cpumask_equal(p->cpus_ptr, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -1121,7 +1564,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, /* Need help from migration thread: drop lock and wait. 
*/ task_rq_unlock(rq, p, &rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - tlb_migrate_finish(p->mm); return 0; } else if (task_on_rq_queued(p)) { /* @@ -1207,11 +1649,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) rq_pin_lock(src_rq, &srf); rq_pin_lock(dst_rq, &drf); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); @@ -1254,10 +1694,10 @@ static int migrate_swap_stop(void *data) if (task_cpu(arg->src_task) != arg->src_cpu) goto unlock; - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) goto unlock; - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) goto unlock; __migrate_swap_task(arg->src_task, arg->dst_cpu); @@ -1299,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) goto out; - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) goto out; - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) goto out; trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); @@ -1447,7 +1887,7 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); /* - * ->cpus_allowed is protected by both rq->lock and p->pi_lock + * ->cpus_ptr is protected by both rq->lock and p->pi_lock * * A few notes on cpu_active vs cpu_online: * @@ -1487,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for_each_cpu(dest_cpu, nodemask) { if (!cpu_active(dest_cpu)) continue; - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) return dest_cpu; } } for (;;) { /* Any allowed, online CPU? */ - for_each_cpu(dest_cpu, &p->cpus_allowed) { + for_each_cpu(dest_cpu, p->cpus_ptr) { if (!is_cpu_allowed(p, dest_cpu)) continue; @@ -1538,7 +1978,7 @@ out: } /* - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. */ static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) @@ -1548,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) if (p->nr_cpus_allowed > 1) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else - cpu = cpumask_any(&p->cpus_allowed); + cpu = cpumask_any(p->cpus_ptr); /* * In order not to call set_task_cpu() on a blocking task we need - * to rely on ttwu() to place the task on a valid ->cpus_allowed + * to rely on ttwu() to place the task on a valid ->cpus_ptr * CPU. * * Since this is common to all placement strategies, this lives here. @@ -1651,16 +2091,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) __schedstat_inc(p->se.statistics.nr_wakeups_sync); } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ - activate_task(rq, p, en_flags); - p->on_rq = TASK_ON_RQ_QUEUED; - - /* If a worker is waking up, notify the workqueue: */ - if (p->flags & PF_WQ_WORKER) - wq_worker_waking_up(p, cpu_of(rq)); -} - /* * Mark the task runnable and perform wakeup-preemption. 
*/ @@ -1712,7 +2142,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, en_flags |= ENQUEUE_MIGRATED; #endif - ttwu_activate(rq, p, en_flags); + activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); } @@ -1969,6 +2399,30 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + preempt_disable(); + if (p == current) { + /* + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) + * == smp_processor_id()'. Together this means we can special + * case the whole 'p->on_rq && ttwu_remote()' case below + * without taking any locks. + * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ + if (!(p->state & state)) + goto out; + + success = 1; + cpu = task_cpu(p); + trace_sched_waking(p); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); + goto out; + } + /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -1978,7 +2432,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); if (!(p->state & state)) - goto out; + goto unlock; trace_sched_waking(p); @@ -2008,7 +2462,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) - goto stat; + goto unlock; #ifdef CONFIG_SMP /* @@ -2068,65 +2522,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); -stat: - ttwu_stat(p, cpu, wake_flags); -out: +unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); +out: + if (success) + ttwu_stat(p, cpu, wake_flags); + preempt_enable(); return success; } /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. - */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ - struct rq *rq = task_rq(p); - - if (WARN_ON_ONCE(rq != this_rq()) || - WARN_ON_ONCE(p == current)) - return; - - lockdep_assert_held(&rq->lock); - - if (!raw_spin_trylock(&p->pi_lock)) { - /* - * This is OK, because current is on_cpu, which avoids it being - * picked for load-balance and preemption/IRQs are still - * disabled avoiding further scheduler activity on it and we've - * not yet picked a replacement task. - */ - rq_unlock(rq, rf); - raw_spin_lock(&p->pi_lock); - rq_relock(rq, rf); - } - - if (!(p->state & TASK_NORMAL)) - goto out; - - trace_sched_waking(p); - - if (!task_on_rq_queued(p)) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&rq->nr_iowait); - } - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); - } - - ttwu_do_wakeup(rq, p, 0, rf); - ttwu_stat(p, smp_processor_id(), 0); -out: - raw_spin_unlock(&p->pi_lock); -} - -/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. 
* @@ -2190,6 +2596,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif +#ifdef CONFIG_COMPACTION + p->capture_control = NULL; +#endif init_numa_balancing(clone_flags, p); } @@ -2325,6 +2734,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) */ p->prio = current->normal_prio; + uclamp_fork(p); + /* * Revert to default priority/policy on fork if requested. */ @@ -2420,7 +2831,7 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: - * - cpus_allowed can change in the fork path + * - cpus_ptr can change in the fork path * - any previously selected CPU might disappear through hotplug * * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, @@ -2431,10 +2842,9 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); - post_init_entity_util_avg(&p->se); + post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -3059,7 +3469,6 @@ void scheduler_tick(void) update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); - cpu_load_update_active(rq); calc_global_load_tick(rq); psi_task_tick(rq); @@ -3433,25 +3842,11 @@ static void __sched notrace __schedule(bool preempt) prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); - prev->on_rq = 0; if (prev->in_iowait) { atomic_inc(&rq->nr_iowait); delayacct_blkio_start(); } - - /* - * If a worker went to sleep, notify and ask workqueue - * whether it wants to wake up a task to maintain - * concurrency. - */ - if (prev->flags & PF_WQ_WORKER) { - struct task_struct *to_wakeup; - - to_wakeup = wq_worker_sleeping(prev); - if (to_wakeup) - try_to_wake_up_local(to_wakeup, &rf); - } } switch_count = &prev->nvcsw; } @@ -3511,6 +3906,20 @@ static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) return; + + /* + * If a worker went to sleep, notify and ask workqueue whether + * it wants to wake up a task to maintain concurrency. + * As this function is called inside the schedule() context, + * we disable preemption to avoid it calling schedule() again + * in the possible wakeup of a kworker. + */ + if (tsk->flags & PF_WQ_WORKER) { + preempt_disable(); + wq_worker_sleeping(tsk); + preempt_enable_no_resched(); + } + /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -3519,6 +3928,12 @@ static inline void sched_submit_work(struct task_struct *tsk) blk_schedule_flush_plug(tsk); } +static void sched_update_worker(struct task_struct *tsk) +{ + if (tsk->flags & PF_WQ_WORKER) + wq_worker_running(tsk); +} + asmlinkage __visible void __sched schedule(void) { struct task_struct *tsk = current; @@ -3529,6 +3944,7 @@ asmlinkage __visible void __sched schedule(void) __schedule(false); sched_preempt_enable_no_resched(); } while (need_resched()); + sched_update_worker(tsk); } EXPORT_SYMBOL(schedule); @@ -4090,6 +4506,13 @@ static void __setscheduler_params(struct task_struct *p, static void __setscheduler(struct rq *rq, struct task_struct *p, const struct sched_attr *attr, bool keep_boost) { + /* + * If params can't change scheduling class changes aren't allowed + * either. 
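With try_to_wake_up_local() gone, the PF_WQ_WORKER notification moves out of __schedule() into helpers that bracket the schedule loop, as the hunks above show: wq_worker_sleeping() is called before blocking, and wq_worker_running() once the task runs again. A stripped-down sketch of that control flow, with placeholder bodies and the preempt_disable()/preempt_enable_no_resched() bracket omitted for brevity:

#include <stdio.h>

#define PF_WQ_WORKER 0x1

struct toy_task { unsigned flags; };

static void wq_worker_sleeping(struct toy_task *t) { (void)t; printf("worker sleeping\n"); }
static void wq_worker_running(struct toy_task *t)  { (void)t; printf("worker running\n");  }
static void core_schedule(void)                    { printf("__schedule()\n");             }

static void sched_submit_work(struct toy_task *t)
{
    /* Before the first __schedule(): tell the workqueue a worker is about
     * to block so it can wake a replacement to keep concurrency up. */
    if (t->flags & PF_WQ_WORKER)
        wq_worker_sleeping(t);
}

static void sched_update_worker(struct toy_task *t)
{
    /* After we are running again. */
    if (t->flags & PF_WQ_WORKER)
        wq_worker_running(t);
}

static void schedule(struct toy_task *t)
{
    sched_submit_work(t);
    core_schedule();        /* the kernel loops here while need_resched() */
    sched_update_worker(t);
}

int main(void)
{
    struct toy_task kworker = { .flags = PF_WQ_WORKER };

    schedule(&kworker);
    return 0;
}

The point of the ordering is that the workqueue hook runs once per schedule() call rather than inside the core __schedule() loop, and cannot recurse into the scheduler while waking a replacement kworker.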
+ */ + if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) + return; + __setscheduler_params(p, attr); /* @@ -4227,6 +4650,13 @@ recheck: return retval; } + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); + if (retval) + return retval; + } + /* * Make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4256,6 +4686,8 @@ recheck: goto change; if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &rf); @@ -4286,7 +4718,7 @@ change: * the entire root_domain to become SCHED_DEADLINE. We * will also fail if there's no bandwidth available. */ - if (!cpumask_subset(span, &p->cpus_allowed) || + if (!cpumask_subset(span, p->cpus_ptr) || rq->rd->dl_bw.bw == 0) { task_rq_unlock(rq, p, &rf); return -EPERM; @@ -4336,7 +4768,9 @@ change: put_prev_task(rq, p); prev_class = p->sched_class; + __setscheduler(rq, p, attr, pi); + __setscheduler_uclamp(p, attr); if (queued) { /* @@ -4512,6 +4946,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a if (ret) return -EFAULT; + if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? @@ -4575,14 +5013,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if ((int)attr.sched_policy < 0) return -EINVAL; + if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) + attr.sched_policy = SETPARAM_POLICY; rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); - if (p != NULL) - retval = sched_setattr(p, &attr); + if (likely(p)) + get_task_struct(p); rcu_read_unlock(); + if (likely(p)) { + retval = sched_setattr(p, &attr); + put_task_struct(p); + } + return retval; } @@ -4733,6 +5178,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else attr.sched_nice = task_nice(p); +#ifdef CONFIG_UCLAMP_TASK + attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; +#endif + rcu_read_unlock(); retval = sched_read_attr(uattr, &attr, size); @@ -4885,7 +5335,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -5142,7 +5592,7 @@ long __sched io_schedule_timeout(long timeout) } EXPORT_SYMBOL(io_schedule_timeout); -void io_schedule(void) +void __sched io_schedule(void) { int token; @@ -5265,9 +5715,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, - compat_pid_t, pid, - struct old_timespec32 __user *, interval) +SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, + struct old_timespec32 __user *, interval) { struct timespec64 t; int retval = sched_rr_get_interval(pid, &t); @@ -5463,7 +5912,7 @@ int task_can_attach(struct task_struct *p, * allowed nodes is unnecessary. Thus, cpusets are not * applicable for such threads. This prevents checking for * success of set_cpus_allowed_ptr() on all attached tasks - * before cpus_allowed may be changed. + * before cpus_mask may be changed. 
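The sched_setattr() path above gains SCHED_FLAG_UTIL_CLAMP handling and the size check against SCHED_ATTR_SIZE_VER1. A userspace sketch of setting a task's utilization clamps follows, assuming a kernel built with CONFIG_UCLAMP_TASK and the VER1 sched_attr layout; glibc provides no wrapper, so the struct and flag values are spelled out by hand and mirror the uapi headers of kernels carrying this series:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* VER1 layout of struct sched_attr (56 bytes). */
struct sched_attr {
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
    uint32_t sched_util_min;
    uint32_t sched_util_max;
};

#define SCHED_FLAG_KEEP_POLICY    0x08
#define SCHED_FLAG_KEEP_PARAMS    0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40

int main(void)
{
    struct sched_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    /* Keep policy and parameters, only update the requested clamps. */
    attr.sched_flags = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS |
                       SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
    attr.sched_util_min = 128;      /* never request less than ~12.5% capacity */
    attr.sched_util_max = 512;      /* never request more than 50% capacity    */

    if (syscall(SYS_sched_setattr, 0 /* self */, &attr, 0 /* flags */))
        perror("sched_setattr");
    return 0;
}

The KEEP_POLICY/KEEP_PARAMS combination relies on the early returns added above (SETPARAM_POLICY and the SCHED_FLAG_KEEP_PARAMS bail-out in __setscheduler()), so only __setscheduler_uclamp() takes effect.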
*/ if (p->flags & PF_NO_SETAFFINITY) { ret = -EINVAL; @@ -5490,7 +5939,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (curr_cpu == target_cpu) return 0; - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; /* TODO: This is not properly updating schedstats */ @@ -5628,7 +6077,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) put_prev_task(rq, next); /* - * Rules for changing task_struct::cpus_allowed are holding + * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either * stabilizes the mask. * @@ -5867,14 +6316,11 @@ void __init sched_init_smp(void) /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot - * happen. The hotplug lock is nevertheless taken to satisfy lockdep, - * but there won't be any contention on it. + * happen. */ - cpus_read_lock(); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); - cpus_read_unlock(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) @@ -5889,7 +6335,7 @@ void __init sched_init_smp(void) static int __init migration_init(void) { - sched_rq_cpu_starting(smp_processor_id()); + sched_cpu_starting(smp_processor_id()); return 0; } early_initcall(migration_init); @@ -5925,8 +6371,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); void __init sched_init(void) { - int i, j; unsigned long alloc_size = 0, ptr; + int i; wait_bit_init(); @@ -6028,10 +6474,6 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif - - for (j = 0; j < CPU_LOAD_IDX_MAX; j++) - rq->cpu_load[j] = 0; - #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -6086,6 +6528,8 @@ void __init sched_init(void) psi_init(); + init_uclamp(); + scheduler_running = 1; } @@ -6162,6 +6606,34 @@ void ___might_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL(___might_sleep); + +void __cant_sleep(const char *file, int line, int preempt_offset) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > preempt_offset) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); + printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_sleep); #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -6502,6 +6974,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) static int cpu_shares_write_u64(struct cgroup_subsys_state *css, struct cftype *cftype, u64 shareval) { + if (shareval > scale_load_down(ULONG_MAX)) + shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load(shareval)); } @@ -6517,7 +6991,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, static DEFINE_MUTEX(cfs_constraints_mutex); const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * 
NSEC_PER_MSEC; /* 1ms */ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6597,20 +7071,22 @@ out_unlock: return ret; } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; - else + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; @@ -6623,17 +7099,20 @@ long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period; + if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; return tg_set_cfs_bandwidth(tg, period, quota); } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; @@ -6941,7 +7420,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, { char tok[21]; /* U64_MAX */ - if (!sscanf(buf, "%s %llu", tok, periodp)) + if (sscanf(buf, "%20s %llu", tok, periodp) < 1) return -EINVAL; *periodp *= NSEC_PER_USEC; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..5cc4012572ec 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpudl.c * * Global CPU deadline management * * Author: Juri Lelli <j.lelli@sssup.it> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. */ #include "sched.h" @@ -124,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { return 1; } else { int best_cpu = cpudl_maximum(cp); WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 22bd8980f32f..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@ */ #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. @@ -48,8 +48,8 @@ EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); * * Clear the update_util_data pointer for the given CPU. 
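tg_set_cfs_quota() and tg_set_cfs_period() above now reject microsecond values whose conversion to nanoseconds would overflow u64 instead of silently wrapping; the guard is a division against the limit before multiplying. The same check in a standalone form (illustrative helper, not the kernel function):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* Convert microseconds to nanoseconds, refusing inputs that would
 * overflow a u64, mirroring the new bandwidth-interface checks. */
static int us_to_ns_checked(uint64_t us, uint64_t *ns)
{
    if (us > UINT64_MAX / NSEC_PER_USEC)
        return -EINVAL;
    *ns = us * NSEC_PER_USEC;
    return 0;
}

int main(void)
{
    uint64_t ns;

    if (!us_to_ns_checked(100000, &ns))             /* 100ms period */
        printf("ok: %llu ns\n", (unsigned long long)ns);

    if (us_to_ns_checked(UINT64_MAX / 10, &ns))     /* would overflow */
        printf("rejected oversized value\n");
    return 0;
}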
* - * Callers must use RCU-sched callbacks to free any memory that might be - * accessed via the old update_util_data pointer or invoke synchronize_sched() + * Callers must use RCU callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_rcu() * right after this function to avoid use-after-free. */ void cpufreq_remove_update_util_hook(int cpu) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..636ca6f88c8e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@ #include <linux/sched/cpufreq.h> #include <trace/events/power.h> +#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -48,7 +50,6 @@ struct sugov_cpu { bool iowait_boost_pending; unsigned int iowait_boost; - unsigned int iowait_boost_max; u64 last_update; unsigned long bw_dl; @@ -195,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type) +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) { unsigned long dl_util, util, irq; struct rq *rq = cpu_rq(cpu); - if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) + if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { return max; + } /* * Early check to see if IRQ/steal time saturates the CPU, can be @@ -218,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, * CFS tasks and we use the same metric to track the effective * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. */ - util = util_cfs; - util += cpu_util_rt(rq); + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_util_with(rq, util, p); dl_util = cpu_util_dl(rq); @@ -275,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); unsigned long util = cpu_util_cfs(rq); - unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); } /** @@ -291,8 +302,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * * The IO wait boost of a task is disabled after a tick since the last update * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO. 
*/ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, bool set_iowait_boost) @@ -303,8 +314,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -318,8 +328,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, * * Each time a task wakes up after an IO operation, the CPU utilization can be * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + * * To keep doubling, an IO boost has to be requested at least once per tick, * otherwise we restart from the utilization of the minimum OPP. */ @@ -344,14 +355,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = + min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = IOWAIT_BOOST_MIN; } /** @@ -373,47 +383,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long util, unsigned long max) { - unsigned int boost_util, boost_max; + unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return util; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return util; - /* - * An IO waiting task has just woken up: - * allow to further double the boost value - */ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* - * Otherwise: reduce the boost value and disable it when we - * reach the minimum. + * No boost pending; reduce the boost value. */ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return; + return util; } } + sg_cpu->iowait_boost_pending = false; + /* - * Apply the current boost value: a CPU is boosted only if its current - * utilization is smaller then the current IO boost level. + * @util is already in capacity scale; convert iowait_boost + * into the same scale so we can compare. 
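With iowait_boost_max removed, the boost above always lives in the generic capacity scale [IOWAIT_BOOST_MIN, SCHED_CAPACITY_SCALE]; the conversion described by the comment above reduces to scaling the boost by the CPU's maximum capacity and taking the larger of boost and utilization, as the following hunk does. The arithmetic in isolation (standalone sketch, values chosen arbitrarily):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)
#define IOWAIT_BOOST_MIN     (SCHED_CAPACITY_SCALE / 8)

/* util and max are in the CPU's capacity units, iowait_boost is in the
 * generic [0..SCHED_CAPACITY_SCALE] range. */
static unsigned long apply_iowait_boost(unsigned long util, unsigned long max,
                                        unsigned long iowait_boost)
{
    unsigned long boost = (iowait_boost * max) >> SCHED_CAPACITY_SHIFT;

    return boost > util ? boost : util;
}

int main(void)
{
    unsigned long max = 1024;       /* big core         */
    unsigned long util = 100;       /* light CFS load   */

    /* Successive "frequent" IO wakeups double the boost each time. */
    for (unsigned long b = IOWAIT_BOOST_MIN; b <= SCHED_CAPACITY_SCALE; b <<= 1)
        printf("boost=%4lu -> effective util=%4lu\n",
               b, apply_iowait_boost(util, max, b));
    return 0;
}

Keeping the boost in capacity units is what lets sugov_iowait_apply() return a plain utilization value instead of rewriting both *util and *max.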
*/ - boost_util = sg_cpu->iowait_boost; - boost_max = sg_cpu->iowait_boost_max; - if (*util * boost_max < *max * boost_util) { - *util = boost_util; - *max = boost_max; - } + boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; + return max(boost, util); } #ifdef CONFIG_NO_HZ_COMMON @@ -460,7 +461,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, util = sugov_get_util(sg_cpu); max = sg_cpu->max; - sugov_iowait_apply(sg_cpu, time, &util, &max); + util = sugov_iowait_apply(sg_cpu, time, util, max); next_f = get_next_freq(sg_policy, util, max); /* * Do not reduce the frequency if the CPU has not been idle @@ -500,7 +501,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) j_util = sugov_get_util(j_sg_cpu); j_max = j_sg_cpu->max; - sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); + j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); if (j_util * max > j_max * util) { util = j_util; @@ -609,13 +610,14 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); -static struct attribute *sugov_attributes[] = { +static struct attribute *sugov_attrs[] = { &rate_limit_us.attr, NULL }; +ATTRIBUTE_GROUPS(sugov); static struct kobj_type sugov_tunables_ktype = { - .default_attrs = sugov_attributes, + .default_groups = sugov_groups, .sysfs_ops = &governor_sysfs_ops, }; @@ -782,6 +784,7 @@ out: return 0; fail: + kobject_put(&tunables->attr_set.kobj); policy->governor_data = NULL; sugov_tunables_free(tunables); @@ -837,7 +840,6 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } for_each_cpu(cpu, policy->cpus) { @@ -859,7 +861,7 @@ static void sugov_stop(struct cpufreq_policy *policy) for_each_cpu(cpu, policy->cpus) cpufreq_remove_update_util_hook(cpu); - synchronize_sched(); + synchronize_rcu(); if (!policy->fast_switch_enabled) { irq_work_sync(&sg_policy->irq_work); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..b7abca987d94 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/cpupri.c * @@ -20,11 +21,6 @@ * searches). For tasks with affinity restrictions, the algorithm has a * worst case complexity of O(min(102, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. 
*/ #include "sched.h" @@ -98,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) continue; - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) continue; if (lowest_mask) { - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba4a143bdcf3..2305ce89a26c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple CPU accounting cgroup controller */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb8b7b5d745d..ef5b9f6b1d42 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -252,7 +252,6 @@ static void task_non_contending(struct task_struct *p) if (dl_entity_is_special(dl_se)) return; - WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -269,7 +268,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { @@ -539,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * If we cannot preempt any rq, fall back to pick any * online CPU: */ - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); if (cpu >= nr_cpu_ids) { /* * Failed to find any suitable CPU. @@ -727,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * refill the runtime and set the deadline a period in the future, * because keeping the current (absolute) deadline of the task would * result in breaking guarantees promised to other tasks (refer to - * Documentation/scheduler/sched-deadline.txt for more information). + * Documentation/scheduler/sched-deadline.rst for more information). 
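task_non_contending() above decides whether to arm the inactive timer by comparing now with the task's 0-lag time, the instant at which the remaining runtime, consumed at the reserved bandwidth dl_runtime/dl_period, brings the lag to zero; in the kernel this is computed as roughly deadline - runtime * dl_period / dl_runtime, and the hunk extends the "release bandwidth immediately" case to when the inactive timer is already active. A small worked example of the arithmetic (values invented, all in nanoseconds):

#include <stdint.h>
#include <stdio.h>

/* Purely arithmetic illustration of the 0-lag instant; the kernel also
 * subtracts rq_clock() and handles the negative result specially. */
static int64_t zerolag_time(uint64_t deadline, uint64_t runtime,
                            uint64_t dl_period, uint64_t dl_runtime)
{
    return (int64_t)deadline - (int64_t)(runtime * dl_period / dl_runtime);
}

int main(void)
{
    /* Reservation: 3ms every 10ms; 1ms of runtime left; deadline at t=20ms. */
    uint64_t dl_runtime = 3000000, dl_period = 10000000;
    uint64_t runtime = 1000000, deadline = 20000000;

    printf("0-lag time: %lld ns\n",
           (long long)zerolag_time(deadline, runtime, dl_period, dl_runtime));
    return 0;
}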
* * This function returns true if: * @@ -1196,7 +1195,7 @@ static void update_curr_dl(struct rq *rq) &curr->dl); } else { unsigned long scale_freq = arch_scale_freq_capacity(cpu); - unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); scaled_delta_exec = cap_scale(delta_exec, scale_freq); scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); @@ -1767,7 +1766,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) deadline_queue_push_tasks(rq); if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_task(rq), rq, 0); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1776,7 +1775,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -1793,7 +1792,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) { update_curr_dl(rq); - update_dl_rq_load_avg(rq_clock_task(rq), rq, 1); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * Even when we have runtime, update_curr_dl() might have resulted in us * not being the leftmost task anymore. In that case NEED_RESCHED will @@ -1825,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; } @@ -1975,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index de3de997e245..f7e4579e746c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/sched/debug.c * * Print the CFS rbtree and other debugging details * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
*/ #include "sched.h" @@ -236,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) *tablep = NULL; } -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - static void set_table_entry(struct ctl_table *entry, const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) + umode_t mode, proc_handler *proc_handler) { entry->procname = procname; entry->data = data; entry->maxlen = maxlen; entry->mode = mode; entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } } static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table = sd_alloc_ctl_entry(9); if (table == NULL) return NULL; - set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); - set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[8] is terminator */ return table; } @@ -315,6 +298,7 @@ void register_sched_domain_sysctl(void) { static struct ctl_table *cpu_entries; static struct ctl_table **cpu_idx; + static bool init_done = false; char buf[32]; int i; @@ -344,7 +328,10 @@ void register_sched_domain_sysctl(void) if (!cpumask_available(sd_sysctl_cpus)) { if 
(!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + } + if (!init_done) { + init_done = true; /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } @@ -652,8 +639,6 @@ do { \ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); - SEQ_printf(m, " .%-30s: %lu\n", "load", - rq->load.weight); P(nr_switches); P(nr_load_updates); P(nr_uninterruptible); @@ -661,11 +646,6 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); - P(cpu_load[0]); - P(cpu_load[1]); - P(cpu_load[2]); - P(cpu_load[3]); - P(cpu_load[4]); #undef P #undef PN @@ -698,7 +678,7 @@ do { \ static const char *sched_tunable_scaling_names[] = { "none", - "logaritmic", + "logarithmic", "linear" }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 310d0637fe4b..036be95a87e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -248,13 +248,6 @@ const struct sched_class fair_sched_class; */ #ifdef CONFIG_FAIR_GROUP_SCHED - -/* cpu runqueue to which this cfs_rq is attached */ -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return cfs_rq->rq; -} - static inline struct task_struct *task_of(struct sched_entity *se) { SCHED_WARN_ON(!entity_is_task(se)); @@ -282,79 +275,116 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return grp->my_q; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) { - if (!cfs_rq->on_list) { - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + if (!path) + return; + + if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) + autogroup_path(cfs_rq->tg, path, len); + else if (cfs_rq && cfs_rq->tg->css.cgroup) + cgroup_path(cfs_rq->tg->css.cgroup, path, len); + else + strlcpy(path, "(null)", len); +} + +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); + + if (cfs_rq->on_list) + return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; + + cfs_rq->on_list = 1; + + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the top of the tree + */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { /* - * Ensure we either appear before our parent (if already - * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases and a special case for the root - * cfs_rq. Furthermore, it also means that we will always reset - * tmp_alone_branch either when the branch is connected - * to a tree or when we reach the beg of the tree + * If parent is already on the list, we add the child + * just before. Thanks to circular linked property of + * the list, this means to put the child at the tail + * of the list that starts by parent. */ - if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { - /* - * If parent is already on the list, we add the child - * just before. Thanks to circular linked property of - * the list, this means to put the child at the tail - * of the list that starts by parent. 
- */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); - /* - * The branch is now connected to its tree so we can - * reset tmp_alone_branch to the beginning of the - * list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else if (!cfs_rq->tg->parent) { - /* - * cfs rq without parent should be put - * at the tail of the list. - */ - list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - /* - * We have reach the beg of a tree so we can reset - * tmp_alone_branch to the beginning of the list. - */ - rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; - } else { - /* - * The parent has not already been added so we want to - * make sure that it will be put after us. - * tmp_alone_branch points to the beg of the branch - * where we will add parent. - */ - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - rq->tmp_alone_branch); - /* - * update tmp_alone_branch to points to the new beg - * of the branch - */ - rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; - } + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; + } - cfs_rq->on_list = 1; + if (!cfs_rq->tg->parent) { + /* + * cfs rq without parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reach the top of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + return true; } + + /* + * The parent has not already been added so we want to + * make sure that it will be put after us. + * tmp_alone_branch points to the begin of the branch + * where we will add parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); + /* + * update tmp_alone_branch to points to the new begin + * of the branch + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; + return false; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + + /* + * With cfs_rq being unthrottled/throttled during an enqueue, + * it can happen the tmp_alone_branch points the a leaf that + * we finally want to del. In this case, tmp_alone_branch moves + * to the prev element but it will point to rq->leaf_cfs_rq_list + * at the end of the enqueue. + */ + if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) + rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); cfs_rq->on_list = 0; } } -/* Iterate through all leaf cfs_rq's on a runqueue: */ -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ + SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); +} + +/* Iterate thr' all leaf cfs_rq's on a runqueue */ +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ + leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? 
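The new for_each_leaf_cfs_rq_safe() iterator above is built on list_for_each_entry_safe() precisely because leaf cfs_rq's can now be removed from the list while it is being walked (throttling, and the tmp_alone_branch fix-up in list_del_leaf_cfs_rq()). The "safe" trick is simply caching the next pointer before the current entry may be deleted; a self-contained miniature of that pattern, not the kernel list implementation:

#include <stdio.h>

/* Minimal circular doubly-linked list in the spirit of struct list_head. */
struct node { struct node *prev, *next; int id; };

static void list_init(struct node *head) { head->prev = head->next = head; }

static void list_add_tail(struct node *head, struct node *n)
{
    n->prev = head->prev;
    n->next = head;
    head->prev->next = n;
    head->prev = n;
}

static void list_del(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

int main(void)
{
    struct node head, nodes[4];

    list_init(&head);
    for (int i = 0; i < 4; i++) {
        nodes[i].id = i;
        list_add_tail(&head, &nodes[i]);
    }

    /* "Safe" iteration: remember the next element before possibly deleting
     * the current one, which is what list_for_each_entry_safe() does and
     * why for_each_leaf_cfs_rq_safe() tolerates cfs_rq removal mid-walk. */
    for (struct node *pos = head.next, *tmp = pos->next;
         pos != &head;
         pos = tmp, tmp = pos->next) {
        if (pos->id % 2)            /* delete odd ids while walking */
            list_del(pos);
        else
            printf("kept %d\n", pos->id);
    }
    return 0;
}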
*/ static inline struct cfs_rq * @@ -410,12 +440,6 @@ static inline struct task_struct *task_of(struct sched_entity *se) return container_of(se, struct task_struct, se); } -static inline struct rq *rq_of(struct cfs_rq *cfs_rq) -{ - return container_of(cfs_rq, struct rq, cfs); -} - - #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -438,16 +462,27 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) return NULL; } -static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +{ + if (path) + strlcpy(path, "(null)", len); +} + +static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { + return true; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } -#define for_each_leaf_cfs_rq(rq, cfs_rq) \ - for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) +static inline void assert_list_leaf_cfs_rq(struct rq *rq) +{ +} + +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) static inline struct sched_entity *parent_entity(struct sched_entity *se) { @@ -686,9 +721,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) return calc_delta_fair(sched_slice(cfs_rq, se), se); } -#ifdef CONFIG_SMP #include "pelt.h" -#include "sched-pelt.h" +#ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); @@ -744,11 +778,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se); * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap. */ -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; - long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; if (cap > 0) { @@ -763,22 +798,19 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - if (entity_is_task(se)) { - struct task_struct *p = task_of(se); - if (p->sched_class != &fair_sched_class) { - /* - * For !fair tasks do: - * - update_cfs_rq_load_avg(now, cfs_rq); - attach_entity_load_avg(cfs_rq, se, 0); - switched_from_fair(rq, p); - * - * such that the next switched_to_fair() has the - * expected state. - */ - se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); - return; - } + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se, 0); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. 
+ */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; } attach_entity_cfs_rq(se); @@ -788,7 +820,7 @@ void post_init_entity_util_avg(struct sched_entity *se) void init_entity_runnable_average(struct sched_entity *se) { } -void post_init_entity_util_avg(struct sched_entity *se) +void post_init_entity_util_avg(struct task_struct *p) { } static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) @@ -1035,7 +1067,7 @@ unsigned int sysctl_numa_balancing_scan_size = 256; unsigned int sysctl_numa_balancing_scan_delay = 1000; struct numa_group { - atomic_t refcount; + refcount_t refcount; spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; @@ -1104,7 +1136,7 @@ static unsigned int task_scan_start(struct task_struct *p) unsigned long shared = group_faults_shared(ng); unsigned long private = group_faults_priv(ng); - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; } @@ -1127,7 +1159,7 @@ static unsigned int task_scan_max(struct task_struct *p) unsigned long private = group_faults_priv(ng); unsigned long period = smax; - period *= atomic_read(&ng->refcount); + period *= refcount_read(&ng->refcount); period *= shared + 1; period /= private + shared + 1; @@ -1160,7 +1192,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* New address space, reset the preferred nid */ if (!(clone_flags & CLONE_VM)) { - p->numa_preferred_nid = -1; + p->numa_preferred_nid = NUMA_NO_NODE; return; } @@ -1180,13 +1212,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running += (p->numa_preferred_nid != -1); + rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { - rq->nr_numa_running -= (p->numa_preferred_nid != -1); + rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); } @@ -1400,7 +1432,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, * two full passes of the "multi-stage node selection" test that is * executed below. */ - if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && + if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) return true; @@ -1453,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long weighted_cpuload(struct rq *rq); -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +static unsigned long cpu_runnable_load(struct rq *rq); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -1476,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) for_each_cpu(cpu, cpumask_of_node(nid)) { struct rq *rq = cpu_rq(cpu); - ns->load += weighted_cpuload(rq); + ns->load += cpu_runnable_load(rq); ns->compute_capacity += capacity_of(cpu); } @@ -1608,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env, * be incurred if the tasks were swapped. 
*/ /* Skip this swap candidate if cannot move to the source cpu */ - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) goto unlock; /* @@ -1705,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { /* Skip this CPU if the source task cannot migrate */ - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; env->dst_cpu = cpu; @@ -1848,7 +1878,7 @@ static void numa_migrate_preferred(struct task_struct *p) unsigned long interval = HZ; /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1994,6 +2024,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; @@ -2095,7 +2129,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) static void task_numa_placement(struct task_struct *p) { - int seq, nid, max_nid = -1; + int seq, nid, max_nid = NUMA_NO_NODE; unsigned long max_faults = 0; unsigned long fault_types[2] = { 0, 0 }; unsigned long total_faults; @@ -2203,12 +2237,12 @@ static void task_numa_placement(struct task_struct *p) static inline int get_numa_group(struct numa_group *grp) { - return atomic_inc_not_zero(&grp->refcount); + return refcount_inc_not_zero(&grp->refcount); } static inline void put_numa_group(struct numa_group *grp) { - if (atomic_dec_and_test(&grp->refcount)) + if (refcount_dec_and_test(&grp->refcount)) kfree_rcu(grp, rcu); } @@ -2229,7 +2263,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!grp) return; - atomic_set(&grp->refcount, 1); + refcount_set(&grp->refcount, 1); grp->active_nodes = 1; grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); @@ -2580,7 +2614,7 @@ out: /* * Drive the periodic memory faults.. */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr) { struct callback_head *work = &curr->numa_work; u64 period, now; @@ -2638,7 +2672,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) * the preferred node. 
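struct numa_group's reference count switches from atomic_t to refcount_t above, so get_numa_group()/put_numa_group() now use refcount_inc_not_zero()/refcount_dec_and_test(), which warn and saturate on misuse instead of silently wrapping. Outside the kernel the same acquire/release shape can be modelled with C11 atomics (a sketch without the saturation checks that make refcount_t worthwhile; names invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct group {
    atomic_uint refcount;
};

static bool group_get(struct group *g)          /* ~refcount_inc_not_zero() */
{
    unsigned int old = atomic_load(&g->refcount);

    while (old != 0) {
        if (atomic_compare_exchange_weak(&g->refcount, &old, old + 1))
            return true;                        /* took a reference */
    }
    return false;                               /* object already dying */
}

static void group_put(struct group *g)          /* ~refcount_dec_and_test() */
{
    if (atomic_fetch_sub(&g->refcount, 1) == 1)
        printf("last reference dropped, free the group\n");
}

int main(void)
{
    struct group g;

    atomic_init(&g.refcount, 1);                /* refcount_set(.., 1) */
    if (group_get(&g))
        group_put(&g);
    group_put(&g);                              /* drops the last reference */
    return 0;
}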
*/ if (dst_nid == p->numa_preferred_nid || - (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) return; } @@ -2668,8 +2703,6 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); @@ -2685,8 +2718,6 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); - if (!parent_entity(se)) - update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); @@ -3122,7 +3153,7 @@ void set_task_rq_fair(struct sched_entity *se, p_last_update_time = prev->avg.last_update_time; n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; } @@ -3257,11 +3288,11 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf /* * runnable_sum can't be lower than running_sum - * As running sum is scale with CPU capacity wehreas the runnable sum - * is not we rescale running_sum 1st + * Rescale running sum to be in the same range as runnable sum + * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT] + * runnable_sum is in [0 : LOAD_AVG_MAX] */ - running_sum = se->avg.util_sum / - arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT; runnable_sum = max(runnable_sum, running_sum); load_sum = (s64)se_weight(se) * runnable_sum; @@ -3316,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + trace_pelt_cfs_tp(cfs_rq); + trace_pelt_se_tp(se); + return 1; } @@ -3364,7 +3398,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages - * @now: current time, as per cfs_rq_clock_task() + * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) @@ -3409,7 +3443,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) decayed = 1; } - decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3468,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq, flags); + + trace_pelt_cfs_tp(cfs_rq); } /** @@ -3487,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); } /* @@ -3499,9 +3537,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - u64 now = cfs_rq_clock_task(cfs_rq); - struct rq *rq = rq_of(cfs_rq); - int cpu = cpu_of(rq); + u64 
now = cfs_rq_clock_pelt(cfs_rq); int decayed; /* @@ -3509,7 +3545,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * track group sched_entity load average for task_h_load calc in migration */ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) - __update_load_avg_se(now, cpu, cfs_rq, se); + __update_load_avg_se(now, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq); decayed |= propagate_entity_load_avg(se); @@ -3555,20 +3591,20 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) * Synchronize entity load avg of dequeued entity without locking * the previous rq. */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); + __update_load_avg_blocked_se(last_update_time, se); } /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; @@ -3577,10 +3613,6 @@ void remove_entity_load_avg(struct sched_entity *se) * tasks cannot exit without having gone through wake_up_new_task() -> * post_init_entity_util_avg() which will have added things to the * cfs_rq, so we can remove unconditionally. - * - * Similarly for groups, they will have passed through - * post_init_entity_util_avg() before unregister_sched_fair_group() - * calls this. */ sync_entity_load_avg(se); @@ -3654,6 +3686,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { long last_ewma_diff; struct util_est ue; + int cpu; if (!sched_feat(UTIL_EST)) return; @@ -3688,6 +3721,14 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * To avoid overestimation of actual task utilization, skip updates if + * we cannot grant there is idle time in this CPU. + */ + cpu = cpu_of(rq_of(cfs_rq)); + if (task_util(p) > capacity_orig_of(cpu)) + return; + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample @@ -4079,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. 
dont track it * when there are only lesser-weight tasks around): */ - if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { schedstat_set(se->statistics.slice_max, max((u64)schedstat_val(se->statistics.slice_max), se->sum_exec_runtime - se->prev_sum_exec_runtime)); @@ -4429,6 +4471,10 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; + + /* Add cfs_rq with already running entity in the list */ + if (cfs_rq->nr_running >= 1) + list_add_leaf_cfs_rq(cfs_rq); } return 0; @@ -4440,8 +4486,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; /* group is entering throttled state, stop time */ - if (!cfs_rq->throttle_count) + if (!cfs_rq->throttle_count) { cfs_rq->throttled_clock_task = rq_clock_task(rq); + list_del_leaf_cfs_rq(cfs_rq); + } cfs_rq->throttle_count++; return 0; @@ -4544,6 +4592,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) break; } + assert_list_leaf_cfs_rq(rq); + if (!se) add_nr_running(rq, task_delta); @@ -4565,7 +4615,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4632,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); if (!remaining) break; @@ -4598,7 +4648,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state. 
*/ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4690,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); @@ -4705,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; + /* don't push forwards an existing deferred unthrottle */ + if (cfs_b->slack_started) + return; + cfs_b->slack_started = true; + hrtimer_start(&cfs_b->slack_timer, ns_to_ktime(cfs_bandwidth_slack_period), HRTIMER_MODE_REL); @@ -4753,17 +4808,19 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); + cfs_b->slack_started = false; if (cfs_b->distribute_running) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return; } @@ -4774,18 +4831,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); if (expires == cfs_b->runtime_expires) lsub_positive(&cfs_b->runtime, runtime); cfs_b->distribute_running = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* @@ -4859,24 +4916,50 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } +extern const u64 max_cfs_quota_period; + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); + unsigned long flags; int overrun; int idle = 0; + int count = 0; - raw_spin_lock(&cfs_b->lock); + raw_spin_lock_irqsave(&cfs_b->lock, flags); for (;;) { overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; - idle = do_sched_cfs_period_timer(cfs_b, overrun); + if (++count > 3) { + u64 new, old = ktime_to_ns(cfs_b->period); + + new = (old * 147) / 128; /* ~115% */ + new = min(new, max_cfs_quota_period); + + cfs_b->period = ns_to_ktime(new); + + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ + cfs_b->quota *= new; + cfs_b->quota = div64_u64(cfs_b->quota, old); + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", + 
smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + + /* reset count so we don't come right back in here */ + count = 0; + } + + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; - raw_spin_unlock(&cfs_b->lock); + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -4894,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; cfs_b->distribute_running = 0; + cfs_b->slack_started = false; } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) @@ -4986,6 +5070,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ + +static inline bool cfs_bandwidth_used(void) +{ + return false; +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { return rq_clock_task(rq_of(cfs_rq)); @@ -5083,7 +5173,6 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu); static inline bool cpu_overutilized(int cpu) { @@ -5092,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu) static inline void update_overutilized_status(struct rq *rq) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); + } } #else static inline void update_overutilized_status(struct rq *rq) { } @@ -5177,6 +5268,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } + if (cfs_bandwidth_used()) { + /* + * When bandwidth control is enabled; the cfs_rq_throttled() + * breaks in the above iteration can result in incomplete + * leaf list maintenance, resulting in triggering the assertion + * below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (list_add_leaf_cfs_rq(cfs_rq)) + break; + } + } + + assert_list_leaf_cfs_rq(rq); + hrtick_update(rq); } @@ -5247,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); #ifdef CONFIG_NO_HZ_COMMON -/* - * per rq 'load' arrray crap; XXX kill this. - */ - -/* - * The exact cpuload calculated at every tick would be: - * - * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load - * - * If a CPU misses updates for n ticks (as it was idle) and update gets - * called on the n+1-th tick when CPU may be busy, then we have: - * - * load_n = (1 - 1/2^i)^n * load_0 - * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load - * - * decay_load_missed() below does efficient calculation of - * - * load' = (1 - 1/2^i)^n * load - * - * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. - * This allows us to precompute the above in said factors, thereby allowing the - * reduction of an arbitrary n in O(log_2 n) steps. (See also - * fixed_power_int()) - * - * The calculation is approximated on a 128 point scale. 
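For reference, the period rescaling in the sched_cfs_period_timer() hunk above slows the timer down without changing the group's CPU share, because the quota is scaled by the same factor as the period. With illustrative values:

	old period = 100ms, old quota = 25ms            (quota/period = 25%)
	new period = 100ms * 147 / 128 = 114.84ms       (capped at max_cfs_quota_period = 1s)
	new quota  = 25ms * 114.84 / 100 = 28.71ms      (still 25% of the new period)

Only the timer frequency changes (it fires about 13% less often); the bandwidth the group may consume, relative to its period, is unchanged.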
- */ -#define DEGRADE_SHIFT 7 - -static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - { 0, 0, 0, 0, 0, 0, 0, 0 }, - { 64, 32, 8, 0, 0, 0, 0, 0 }, - { 96, 72, 40, 12, 1, 0, 0, 0 }, - { 112, 98, 75, 43, 15, 1, 0, 0 }, - { 120, 112, 98, 76, 45, 16, 2, 0 } -}; - -/* - * Update cpu_load for any missed ticks, due to tickless idle. The backlog - * would be when CPU is idle and so we just decay the old load without - * adding any new load. - */ -static unsigned long -decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) -{ - int j = 0; - - if (!missed_updates) - return load; - - if (missed_updates >= degrade_zero_ticks[idx]) - return 0; - - if (idx == 1) - return load >> missed_updates; - - while (missed_updates) { - if (missed_updates % 2) - load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; - - missed_updates >>= 1; - j++; - } - return load; -} static struct { cpumask_var_t idle_cpus_mask; @@ -5323,249 +5366,21 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ -/** - * __cpu_load_update - update the rq->cpu_load[] statistics - * @this_rq: The rq to update statistics for - * @this_load: The current load - * @pending_updates: The number of missed updates - * - * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). - * - * This function computes a decaying average: - * - * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load - * - * Because of NOHZ it might not get called on every tick which gives need for - * the @pending_updates argument. - * - * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 - * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load - * = A * (A * load[i]_n-2 + B) + B - * = A * (A * (A * load[i]_n-3 + B) + B) + B - * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B - * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B - * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B - * = (1 - 1/2^i)^n * (load[i]_0 - load) + load - * - * In the above we've assumed load_n := load, which is true for NOHZ_FULL as - * any change in load would have resulted in the tick being turned back on. - * - * For regular NOHZ, this reduces to: - * - * load[i]_n = (1 - 1/2^i)^n * load[i]_0 - * - * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra - * term. - */ -static void cpu_load_update(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) -{ - unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; - int i, scale; - - this_rq->nr_load_updates++; - - /* Update our load: */ - this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ - for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { - unsigned long old_load, new_load; - - /* scale is effectively 1 << i now, and >> i divides by scale */ - - old_load = this_rq->cpu_load[i]; -#ifdef CONFIG_NO_HZ_COMMON - old_load = decay_load_missed(old_load, pending_updates - 1, i); - if (tickless_load) { - old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); - /* - * old_load can never be a negative value because a - * decayed tickless_load cannot be greater than the - * original tickless_load. - */ - old_load += tickless_load; - } -#endif - new_load = this_load; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. 
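A quick check of the "stuck on 9" case the removed comment above refers to, for idx 1 (scale = 2), where the average is computed as (old*(scale-1) + new) >> idx:

	without the round-up:           (9*1 + 10) >> 1 = 19 >> 1 = 9    (never reaches 10)
	with new_load += scale - 1:     (9*1 + 11) >> 1 = 20 >> 1 = 10

so the extra scale-1 lets the decaying average actually converge to a higher steady load instead of settling one step below it.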
- */ - if (new_load > old_load) - new_load += scale - 1; - - this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; - } -} - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(struct rq *rq) +static unsigned long cpu_runnable_load(struct rq *rq) { return cfs_rq_runnable_load_avg(&rq->cfs); } -#ifdef CONFIG_NO_HZ_COMMON -/* - * There is no sane way to deal with nohz on smp when using jiffies because the - * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading - * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. - * - * Therefore we need to avoid the delta approach from the regular tick when - * possible since that would seriously skew the load calculation. This is why we - * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on - * jiffies deltas for updates happening while in nohz mode (idle ticks, idle - * loop exit, nohz_idle_balance, nohz full exit...) - * - * This means we might still be one tick off for nohz periods. - */ - -static void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) -{ - unsigned long pending_updates; - - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * In the regular NOHZ case, we were idle, this means load 0. - * In the NOHZ_FULL case, we were non-idle, we should consider - * its weighted load. - */ - cpu_load_update(this_rq, load, pending_updates); - } -} - -/* - * Called from nohz_idle_balance() to update the load ratings before doing the - * idle balance. - */ -static void cpu_load_update_idle(struct rq *this_rq) -{ - /* - * bail if there's load or we're actually up-to-date. - */ - if (weighted_cpuload(this_rq)) - return; - - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); -} - -/* - * Record CPU load on nohz entry so we know the tickless load to account - * on nohz exit. cpu_load[0] happens then to be updated more frequently - * than other cpu_load[idx] but it should be fine as cpu_load readers - * shouldn't rely into synchronized cpu_load[*] updates. - */ -void cpu_load_update_nohz_start(void) -{ - struct rq *this_rq = this_rq(); - - /* - * This is all lockless but should be fine. If weighted_cpuload changes - * concurrently we'll exit nohz. And cpu_load write can race with - * cpu_load_update_idle() but both updater would be writing the same. - */ - this_rq->cpu_load[0] = weighted_cpuload(this_rq); -} - -/* - * Account the tickless load in the end of a nohz frame. - */ -void cpu_load_update_nohz_stop(void) -{ - unsigned long curr_jiffies = READ_ONCE(jiffies); - struct rq *this_rq = this_rq(); - unsigned long load; - struct rq_flags rf; - - if (curr_jiffies == this_rq->last_load_update_tick) - return; - - load = weighted_cpuload(this_rq); - rq_lock(this_rq, &rf); - update_rq_clock(this_rq); - cpu_load_update_nohz(this_rq, curr_jiffies, load); - rq_unlock(this_rq, &rf); -} -#else /* !CONFIG_NO_HZ_COMMON */ -static inline void cpu_load_update_nohz(struct rq *this_rq, - unsigned long curr_jiffies, - unsigned long load) { } -#endif /* CONFIG_NO_HZ_COMMON */ - -static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) -{ -#ifdef CONFIG_NO_HZ_COMMON - /* See the mess around cpu_load_update_nohz(). 
*/ - this_rq->last_load_update_tick = READ_ONCE(jiffies); -#endif - cpu_load_update(this_rq, load, 1); -} - -/* - * Called from scheduler_tick() - */ -void cpu_load_update_active(struct rq *this_rq) -{ - unsigned long load = weighted_cpuload(this_rq); - - if (tick_nohz_tick_stopped()) - cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); - else - cpu_load_update_periodic(this_rq, load); -} - -/* - * Return a low guess at the load of a migration-source CPU weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target CPU weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(rq); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - static unsigned long capacity_of(int cpu) { return cpu_rq(cpu)->cpu_capacity; } -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = weighted_cpuload(rq); + unsigned long load_avg = cpu_runnable_load(rq); if (nr_running) return load_avg / nr_running; @@ -5663,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = target_load(this_cpu, sd->wake_idx); + this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5681,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = source_load(prev_cpu, sd->wake_idx); + prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5742,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, unsigned long this_runnable_load = ULONG_MAX; unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; - int load_idx = sd->forkexec_idx; int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; unsigned long imbalance = scale_load_down(NICE_0_LOAD) * (sd->imbalance_pct-100) / 100; - if (sd_flag & SD_BALANCE_WAKE) - load_idx = sd->wake_idx; - do { unsigned long load, avg_load, runnable_load; unsigned long spare_cap, max_spare_cap; @@ -5758,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, /* Skip over this group if it has no CPUs allowed */ if (!cpumask_intersects(sched_group_span(group), - &p->cpus_allowed)) + p->cpus_ptr)) continue; local_group = cpumask_test_cpu(this_cpu, @@ -5773,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, max_spare_cap = 0; for_each_cpu(i, sched_group_span(group)) { - /* Bias balancing toward CPUs of our domain */ - if (local_group) - load = source_load(i, load_idx); - else - load = target_load(i, 
load_idx); - + load = cpu_runnable_load(cpu_rq(i)); runnable_load += load; avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); @@ -5890,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this return cpumask_first(sched_group_span(group)); /* Traverse only the allowed CPUs */ - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); @@ -5914,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { - load = weighted_cpuload(cpu_rq(i)); + load = cpu_runnable_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -5930,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p { int new_cpu = cpu; - if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) return prev_cpu; /* @@ -6047,13 +5853,13 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int if (!test_idle_cores(target, false)) return -1; - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); for_each_cpu_wrap(core, cpus, target) { bool idle = true; for_each_cpu(cpu, cpu_smt_mask(core)) { - cpumask_clear_cpu(cpu, cpus); + __cpumask_clear_cpu(cpu, cpus); if (!available_idle_cpu(cpu)) idle = false; } @@ -6073,7 +5879,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; @@ -6081,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) return cpu; @@ -6097,7 +5903,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6116,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t u64 time, cost; s64 delta; int cpu, nr = INT_MAX; + int this = smp_processor_id(); this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6139,18 +5946,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t nr = 4; } - time = local_clock(); + time = cpu_clock(this); for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { if (!--nr) return -1; - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; if (available_idle_cpu(cpu)) break; } - time = local_clock() - time; + time = cpu_clock(this) - time; cost = this_sd->avg_scan_cost; delta = (s64)(time - cost) / 8; this_sd->avg_scan_cost += delta; @@ -6181,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && available_idle_cpu(recent_used_cpu) && - cpumask_test_cpu(p->recent_used_cpu, 
&p->cpus_allowed)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6202,7 +6009,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); + i = select_idle_smt(p, target); if ((unsigned)i < nr_cpumask_bits) return i; @@ -6425,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) static long compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) { - long util, max_util, sum_util, energy = 0; + unsigned int max_util, util_cfs, cpu_util, cpu_cap; + unsigned long sum_util, energy = 0; + struct task_struct *tsk; int cpu; for (; pd; pd = pd->next) { + struct cpumask *pd_mask = perf_domain_span(pd); + + /* + * The energy model mandates all the CPUs of a performance + * domain have the same capacity. + */ + cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); max_util = sum_util = 0; + /* * The capacity state of CPUs of the current rd can be driven by * CPUs of another rd if they belong to the same performance @@ -6440,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * it will not appear in its pd list and will not be accounted * by compute_energy(). */ - for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { - util = cpu_util_next(cpu, p, dst_cpu); - util = schedutil_energy_util(cpu, util); - max_util = max(util, max_util); - sum_util += util; + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + util_cfs = cpu_util_next(cpu, p, dst_cpu); + + /* + * Busy time computation: utilization clamping is not + * required since the ratio (sum_util / cpu_capacity) + * is already enough to scale the EM reported power + * consumption at the (eventually clamped) cpu_capacity. + */ + sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + ENERGY_UTIL, NULL); + + /* + * Performance domain frequency: utilization clamping + * must be considered since it affects the selection + * of the performance domain frequency. + * NOTE: in case RT tasks are running, by default the + * FREQUENCY_UTIL's utilization can be max OPP. + */ + tsk = cpu == dst_cpu ? p : NULL; + cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } energy += em_pd_energy(pd->em_pd, max_util, sum_util); @@ -6527,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) int max_spare_cap_cpu = -1; for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; /* Skip CPUs that will be overutilized. 
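In the rewritten compute_energy() above, each performance domain is evaluated with two views of the same CFS utilization: sum_util (ENERGY_UTIL, no utilization clamping) estimates how busy the domain will be, while max_util (FREQUENCY_UTIL, clamping and RT pressure included, with the migrating task's clamps applied on dst_cpu) estimates the operating point schedutil would request. As a rough sketch of the shape of the estimate only (the exact computation lives in em_pd_energy()):

	energy(pd) ~ cost_of_OPP(max_util) * sum_util / cpu_capacity

so a higher clamped max_util selects a more expensive operating point, and sum_util then scales that cost by how much of the domain's capacity is actually used.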
*/ @@ -6608,7 +6443,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) return new_cpu; @@ -6616,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && - cpumask_test_cpu(cpu, &p->cpus_allowed); + cpumask_test_cpu(cpu, p->cpus_ptr); } rcu_read_lock(); @@ -7027,6 +6862,12 @@ idle: if (new_tasks > 0) goto again; + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); + return NULL; } @@ -7366,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: * 1) throttled_lb_pair, or - * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 2) cannot be migrated to this CPU due to cpus_ptr, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; schedstat_inc(p->se.statistics.nr_failed_migrations_affine); @@ -7393,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Prevent to re-select dst_cpu via env's CPUs: */ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { env->flags |= LBF_DST_PINNED; env->new_dst_cpu = cpu; break; @@ -7441,7 +7282,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -7480,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) static const unsigned int sched_nr_migrate_break = 32; /* - * detach_tasks() -- tries to detach up to imbalance weighted load from + * detach_tasks() -- tries to detach up to imbalance runnable load from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. @@ -7548,7 +7388,7 @@ static int detach_tasks(struct lb_env *env) /* * We only want to steal up to the prescribed amount of - * weighted load. + * runnable load. 
*/ if (env->imbalance <= 0) break; @@ -7577,7 +7417,6 @@ static void attach_task(struct rq *rq, struct task_struct *p) BUG_ON(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -7618,6 +7457,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) @@ -7645,12 +7485,42 @@ static inline bool others_have_blocked(struct rq *rq) return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ + rq->last_blocked_load_update_tick = jiffies; + + if (!has_blocked) + rq->has_blocked_load = 0; +} +#else +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_load_sum) + return false; + + return true; +} + static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; const struct sched_class *curr_class; struct rq_flags rf; bool done = true; @@ -7662,14 +7532,10 @@ static void update_blocked_averages(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) { + for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - /* throttled entities do not contribute to load */ - if (throttled_hierarchy(cfs_rq)) - continue; - - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) update_tg_load_avg(cfs_rq, 0); /* Propagate pending load changes to the parent, if any: */ @@ -7677,24 +7543,27 @@ static void update_blocked_averages(int cpu) if (se && !skip_blocked_update(se)) update_load_avg(cfs_rq_of(se), se, 0); + /* + * There can be a lot of idle CPU cgroups. Don't let fully + * decayed cfs_rqs linger on the list. 
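Because update_blocked_averages() now deletes entries from rq->leaf_cfs_rq_list while walking it (the fully decayed cfs_rqs removed just below), the walk above moves to the _safe iterator, which fetches the next element before the loop body runs. A generic illustration of the same pattern, not the scheduler's actual types, using the list helpers from <linux/list.h>:

struct item {
	int			dead;
	struct list_head	node;
};

static void prune(struct list_head *head)
{
	struct item *it, *tmp;

	/* 'tmp' already points at the next entry, so deleting 'it' is safe */
	list_for_each_entry_safe(it, tmp, head, node) {
		if (it->dead)
			list_del(&it->node);
	}
}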
+ */ + if (cfs_rq_is_decayed(cfs_rq)) + list_del_leaf_cfs_rq(cfs_rq); + /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) done = false; } curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ if (others_have_blocked(rq)) done = false; -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (done) - rq->has_blocked_load = 0; -#endif + update_blocked_load_status(rq, !done); rq_unlock_irqrestore(rq, &rf); } @@ -7713,10 +7582,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) if (cfs_rq->last_h_load_update == now) return; - cfs_rq->h_load_next = NULL; + WRITE_ONCE(cfs_rq->h_load_next, NULL); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_load_next = se; + WRITE_ONCE(cfs_rq->h_load_next, se); if (cfs_rq->last_h_load_update == now) break; } @@ -7726,7 +7595,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) cfs_rq->last_h_load_update = now; } - while ((se = cfs_rq->h_load_next) != NULL) { + while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); @@ -7754,17 +7623,13 @@ static inline void update_blocked_averages(int cpu) rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) - rq->has_blocked_load = 0; -#endif + update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); rq_unlock_irqrestore(rq, &rf); } @@ -7782,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; unsigned long group_util; /* Total utilization of the group */ @@ -7836,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) }; } -/** - * get_sd_load_idx - Obtain the load index for a given sched domain. - * @sd: The sched_domain whose load_idx is to be obtained. - * @idle: The idle status of the CPU for whose sd load_idx is obtained. - * - * Return: The load index. 
- */ -static inline int get_sd_load_idx(struct sched_domain *sd, - enum cpu_idle_type idle) -{ - int load_idx; - - switch (idle) { - case CPU_NOT_IDLE: - load_idx = sd->busy_idx; - break; - - case CPU_NEWLY_IDLE: - load_idx = sd->newidle_idx; - break; - default: - load_idx = sd->idle_idx; - break; - } - - return load_idx; -} - static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(sd, cpu); + unsigned long max = arch_scale_cpu_capacity(cpu); unsigned long used, free; unsigned long irq; @@ -7892,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; @@ -7989,8 +7825,20 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) } /* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. + */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ + return rq->misfit_task_load && + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + check_cpu_capacity(rq, sd)); +} + +/* * Group imbalance indicates (and tries to solve) the problem where balancing - * groups is inadequate due to ->cpus_allowed constraints. + * groups is inadequate due to ->cpus_ptr constraints. * * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 
@@ -8140,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sg_lb_stats *sgs, int *sg_status) { - int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); - int load_idx = get_sd_load_idx(env->sd, env->idle); - unsigned long load; int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -8153,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) env->flags |= LBF_NOHZ_AGAIN; - /* Bias balancing toward CPUs of our domain: */ - if (local_group) - load = target_load(i, load_idx); - else - load = source_load(i, load_idx); - - sgs->group_load += load; + sgs->group_load += cpu_runnable_load(rq); sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; @@ -8174,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; #endif - sgs->sum_weighted_load += weighted_cpuload(rq); /* * No need to call idle_cpu() if nr_running is not 0 */ @@ -8193,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; if (sgs->sum_nr_running) - sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; @@ -8407,8 +8245,12 @@ next_group: /* Update over-utilization (tipping point, U >= 0) indicator */ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); } else if (sg_status & SG_OVERUTILIZED) { - WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); + struct root_domain *rd = env->dst_rq->rd; + + WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); } } @@ -8452,9 +8294,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) return 0; - env->imbalance = DIV_ROUND_CLOSEST( - sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity, - SCHED_CAPACITY_SCALE); + env->imbalance = sds->busiest_stat.group_load; return 1; } @@ -8616,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. * - * Also calculates the amount of weighted load which should be moved + * Also calculates the amount of runnable load which should be moved * to restore balance. * * @env: The load balancing environment. @@ -8636,7 +8476,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (static_branch_unlikely(&sched_energy_present)) { + if (sched_energy_enabled()) { struct root_domain *rd = env->dst_rq->rd; if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) @@ -8661,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* * If the busiest group is imbalanced the below checks don't * work because they assume all things are equal, which typically - * isn't true due to cpus_allowed constraints and the like. + * isn't true due to cpus_ptr constraints and the like. 
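The check_asym_packing() change above is effectively an algebraic simplification: as shown in the update_sg_lb_stats() context a few hunks earlier,

	sgs->avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity

so the old imbalance expression

	DIV_ROUND_CLOSEST(avg_load * group_capacity, SCHED_CAPACITY_SCALE) ~= group_load

merely reconstructed, up to integer rounding, the group load it was derived from; the new code assigns sds->busiest_stat.group_load directly.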
*/ if (busiest->group_type == group_imbalanced) goto force_balance; @@ -8735,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_span(group), env->cpus) { - unsigned long capacity, wl; + unsigned long capacity, load; enum fbq_type rt; rq = cpu_rq(i); @@ -8789,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, rq->nr_running == 1) continue; - wl = weighted_cpuload(rq); + load = cpu_runnable_load(rq); /* - * When comparing with imbalance, use weighted_cpuload() + * When comparing with imbalance, use cpu_runnable_load() * which is not scaled with the CPU capacity. */ - if (rq->nr_running == 1 && wl > env->imbalance && + if (rq->nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd)) continue; /* * For the load comparisons with the other CPU's, consider - * the weighted_cpuload() scaled with the CPU capacity, so + * the cpu_runnable_load() scaled with the CPU capacity, so * that the load can be moved away from the CPU that is * potentially running at a lower capacity. * - * Thus we're looking for max(wl_i / capacity_i), crosswise + * Thus we're looking for max(load_i / capacity_i), crosswise * multiplication to rid ourselves of the division works out - * to: wl_i * capacity_j > wl_j * capacity_i; where j is + * to: load_i * capacity_j > load_j * capacity_i; where j is * our previous maximum. */ - if (wl * busiest_capacity > busiest_load * capacity) { - busiest_load = wl; + if (load * busiest_capacity > busiest_load * capacity) { + busiest_load = load; busiest_capacity = capacity; busiest = rq; } @@ -8827,21 +8667,25 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ #define MAX_PINNED_INTERVAL 512 -static int need_active_balance(struct lb_env *env) +static inline bool +asym_active_balance(struct lb_env *env) { - struct sched_domain *sd = env->sd; + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * lower priority CPUs in order to pack all tasks in the + * highest priority CPUs. + */ + return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(env->dst_cpu, env->src_cpu); +} - if (env->idle == CPU_NEWLY_IDLE) { +static inline bool +voluntary_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; - /* - * ASYM_PACKING needs to force migrate tasks from busy but - * lower priority CPUs in order to pack all tasks in the - * highest priority CPUs. - */ - if ((sd->flags & SD_ASYM_PACKING) && - sched_asym_prefer(env->dst_cpu, env->src_cpu)) - return 1; - } + if (asym_active_balance(env)) + return 1; /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 
@@ -8859,6 +8703,16 @@ static int need_active_balance(struct lb_env *env) if (env->src_grp_type == group_misfit_task) return 1; + return 0; +} + +static int need_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + if (voluntary_active_balance(env)) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -9023,7 +8877,7 @@ more_balance: if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { /* Prevent to re-select dst_cpu via env's CPUs */ - cpumask_clear_cpu(env.dst_cpu, env.cpus); + __cpumask_clear_cpu(env.dst_cpu, env.cpus); env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; @@ -9050,7 +8904,7 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { - cpumask_clear_cpu(cpu_of(busiest), cpus); + __cpumask_clear_cpu(cpu_of(busiest), cpus); /* * Attempting to continue load balancing at the current * sched_domain level only makes sense if there are @@ -9089,7 +8943,7 @@ more_balance: * if the curr task on busiest CPU can't be * moved to this_cpu: */ - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); env.flags |= LBF_ALL_PINNED; @@ -9120,7 +8974,7 @@ more_balance: } else sd->nr_balance_failed = 0; - if (likely(!active_balance)) { + if (likely(!active_balance) || voluntary_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; } else { @@ -9427,22 +9281,26 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * anywhere yet. */ static inline int find_new_ilb(void) { - int ilb = cpumask_first(nohz.idle_cpus_mask); + int ilb; - if (ilb < nr_cpu_ids && idle_cpu(ilb)) - return ilb; + for_each_cpu_and(ilb, nohz.idle_cpus_mask, + housekeeping_cpumask(HK_FLAG_MISC)) { + if (idle_cpu(ilb)) + return ilb; + } return nr_cpu_ids; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -9469,15 +9327,8 @@ static void kick_ilb(unsigned int flags) } /* - * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu in the system. - * - This rq has more than one task. - * - This rq has at least one CFS task and the capacity of the CPU is - * significantly reduced because of RT tasks or IRQs. - * - At parent of LLC scheduler domain level, this cpu's scheduler group has - * multiple busy cpu. - * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler - * domain span are idle. + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system. 
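A note on the cpumask conversions scattered through the hunks above: cpumask_clear_cpu() clears the bit with the atomic clear_bit(), while __cpumask_clear_cpu() uses the non-atomic __clear_bit(). The masks involved are per-CPU scratch masks (load_balance_mask, select_idle_mask) that only the local CPU touches during a single balance or idle-scan pass, so the cheaper non-atomic form is sufficient:

	cpumask_clear_cpu(cpu, mask);	/* atomic read-modify-write */
	__cpumask_clear_cpu(cpu, mask);	/* non-atomic update, no locked access */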
*/ static void nohz_balancer_kick(struct rq *rq) { @@ -9510,30 +9361,21 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2 || rq->misfit_task_load) { + if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } rcu_read_lock(); - sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); - if (sds) { - /* - * XXX: write a coherent comment on why we do this. - * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com - */ - nr_busy = atomic_read(&sds->nr_busy_cpus); - if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; - goto unlock; - } - - } sd = rcu_dereference(rq->sd); if (sd) { - if ((rq->cfs.h_nr_running >= 1) && - check_cpu_capacity(rq, sd)) { + /* + * If there's a CFS task and the current CPU has reduced + * capacity; kick the ILB to see if there's a better CPU to run + * on. + */ + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_KICK_MASK; goto unlock; } @@ -9541,17 +9383,57 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { - for_each_cpu(i, sched_domain_span(sd)) { - if (i == cpu || - !cpumask_test_cpu(i, nohz.idle_cpus_mask)) - continue; - + /* + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. + */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { flags = NOHZ_KICK_MASK; goto unlock; } } } + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + + /* + * For asymmetric systems, we do not want to nicely balance + * cache use, instead we want to embrace asymmetry and only + * ensure tasks have enough CPU capacity. + * + * Skip the LLC logic because it's not relevant in that case. + */ + goto unlock; + } + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. 
+ */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + flags = NOHZ_KICK_MASK; + goto unlock; + } + } unlock: rcu_read_unlock(); out: @@ -9730,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, rq_lock_irqsave(rq, &rf); update_rq_clock(rq); - cpu_load_update_idle(rq); rq_unlock_irqrestore(rq, &rf); if (flags & NOHZ_BALANCE_KICK) @@ -10541,15 +10422,19 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_FAIR_GROUP_SCHED .task_change_group = task_change_group_fair, #endif + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *pos; rcu_read_lock(); - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) print_cfs_rq(m, cpu, cfs_rq); rcu_read_unlock(); } @@ -10588,3 +10473,83 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } + +/* + * Helper functions to facilitate extracting info from tracepoints. + */ + +const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) +{ +#ifdef CONFIG_SMP + return cfs_rq ? &cfs_rq->avg : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); + +char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) +{ + if (!cfs_rq) { + if (str) + strlcpy(str, "(null)", len); + else + return NULL; + } + + cfs_rq_tg_path(cfs_rq, str, len); + return str; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); + +int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) +{ + return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); + +const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_rt : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); + +const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq ? &rq->avg_dl : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); + +const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) + return rq ? &rq->avg_irq : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); + +int sched_trace_rq_cpu(struct rq *rq) +{ + return rq ? cpu_of(rq) : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); + +const struct cpumask *sched_trace_rd_span(struct root_domain *rd) +{ +#ifdef CONFIG_SMP + return rd ? rd->span : NULL; +#else + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(sched_trace_rd_span); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 858589b83377..2410db5e9a35 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) -SCHED_FEAT(LB_BIAS, false) /* * Decrement CPU capacity based on time not spent running tasks diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5516bae0c1b..80940939b733 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic entry points for the idle threads and * implementation of the idle task scheduling class. 
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 81faddba9e20..123ea07a3f3b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Housekeeping management. Manage the targets for routine code that can run on * any CPU: unbound workqueues, timers, kthreads and any offloadable work. @@ -65,6 +66,7 @@ void __init housekeeping_init(void) static int __init housekeeping_setup(char *str, enum hk_flags flags) { cpumask_var_t non_housekeeping_mask; + cpumask_var_t tmp; int err; alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +77,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) return 0; } + alloc_bootmem_cpumask_var(&tmp); if (!housekeeping_flags) { alloc_bootmem_cpumask_var(&housekeeping_mask); cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask); - if (cpumask_empty(housekeeping_mask)) - cpumask_set_cpu(smp_processor_id(), housekeeping_mask); - } else { - cpumask_var_t tmp; - alloc_bootmem_cpumask_var(&tmp); + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) { + pr_warn("Housekeeping: must include one present CPU, " + "using boot CPU:%d\n", smp_processor_id()); + __cpumask_set_cpu(smp_processor_id(), housekeeping_mask); + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); + } + } else { + cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); + if (cpumask_empty(tmp)) + __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask); if (!cpumask_equal(tmp, housekeeping_mask)) { pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +101,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags) free_bootmem_cpumask_var(non_housekeeping_mask); return 0; } - free_bootmem_cpumask_var(tmp); } + free_bootmem_cpumask_var(tmp); if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) { if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3cd8a3a795d2..aa8d75804108 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> * * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
*/ #include "sched.h" diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 90fb5bc12ad4..a96db50d40e0 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -26,9 +26,10 @@ #include <linux/sched.h> #include "sched.h" -#include "sched-pelt.h" #include "pelt.h" +#include <trace/events/sched.h> + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -106,16 +107,12 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) * n=1 */ static __always_inline u32 -accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, +accumulate_sum(u64 delta, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { - unsigned long scale_freq, scale_cpu; u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - delta += sa->period_contrib; periods = delta / 1024; /* A period is 1024us (~1ms) */ @@ -137,13 +134,12 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, } sa->period_contrib = delta; - contrib = cap_scale(contrib, scale_freq); if (load) sa->load_sum += load * contrib; if (runnable) sa->runnable_load_sum += runnable * contrib; if (running) - sa->util_sum += contrib * scale_cpu; + sa->util_sum += contrib << SCHED_CAPACITY_SHIFT; return periods; } @@ -177,7 +173,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_sum(u64 now, int cpu, struct sched_avg *sa, +___update_load_sum(u64 now, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -221,7 +217,7 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. 
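The accumulate_sum() changes above drop the per-update invariance scaling: previously the frequency factor was applied to each contribution and the CPU-capacity factor to util_sum on every PELT update; with the clock_pelt infrastructure added further down in this diff (kernel/sched/pelt.h), that scaling happens once, when the rq's PELT clock advances, and the core accumulation only shifts the contribution by SCHED_CAPACITY_SHIFT for util_sum. Illustrative numbers for the new scheme: on a full-capacity CPU running at half its maximum frequency (scale_cpu = 1024, scale_freq = 512), 2ms of wall-clock runtime advances clock_pelt by

	2ms * 1024/1024 * 512/1024 = 1ms

so the task accrues the same load/utilization it would accrue in 1ms at full speed, which is the invariance the removed scale_freq/scale_cpu lines used to provide.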
*/ - if (!accumulate_sum(delta, cpu, sa, load, runnable, running)) + if (!accumulate_sum(delta, sa, load, runnable, running)) return 0; return 1; @@ -267,37 +263,40 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna * runnable_load_avg = \Sum se->avg.runable_load_avg */ -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { + if (___update_load_sum(now, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + trace_pelt_se_tp(se); return 1; } return 0; } -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq, + if (___update_load_sum(now, &se->avg, !!se->on_rq, !!se->on_rq, cfs_rq->curr == se)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); cfs_se_util_change(&se->avg); + trace_pelt_se_tp(se); return 1; } return 0; } -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { - if (___update_load_sum(now, cpu, &cfs_rq->avg, + if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), scale_load_down(cfs_rq->runnable_weight), cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1, 1); + trace_pelt_cfs_tp(cfs_rq); return 1; } @@ -317,12 +316,13 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_rt, + if (___update_load_sum(now, &rq->avg_rt, running, running, running)) { ___update_load_avg(&rq->avg_rt, 1, 1); + trace_pelt_rt_tp(rq); return 1; } @@ -340,12 +340,13 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { - if (___update_load_sum(now, rq->cpu, &rq->avg_dl, + if (___update_load_sum(now, &rq->avg_dl, running, running, running)) { ___update_load_avg(&rq->avg_dl, 1, 1); + trace_pelt_dl_tp(rq); return 1; } @@ -365,28 +366,39 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) int update_irq_load_avg(struct rq *rq, u64 running) { int ret = 0; + + /* + * We can't use clock_pelt because irq time is not accounted in + * clock_task. Instead we directly scale the running time to + * reflect the real amount of computation + */ + running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); + running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); + /* * We know the time that has been used by interrupt since last update * but we don't when. Let be pessimistic and assume that interrupt has * happened just before the update. This is not so far from reality * because interrupt will most probably wake up task and trig an update - * of rq clock during which the metric si updated. + * of rq clock during which the metric is updated. * We start to decay with normal context time and then we add the * interrupt context time. 
* We can safely remove running from rq->clock because * rq->clock += delta with delta >= running */ - ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq, + ret = ___update_load_sum(rq->clock - running, &rq->avg_irq, 0, 0, 0); - ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq, + ret += ___update_load_sum(rq->clock, &rq->avg_irq, 1, 1, 1); - if (ret) + if (ret) { ___update_load_avg(&rq->avg_irq, 1, 1); + trace_pelt_irq_tp(rq); + } return ret; } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7e56b489ff32..afff644da065 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,8 +1,9 @@ #ifdef CONFIG_SMP +#include "sched-pelt.h" -int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se); -int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se); -int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq); +int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); +int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); +int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); @@ -42,6 +43,101 @@ static inline void cfs_se_util_change(struct sched_avg *avg) WRITE_ONCE(avg->util_est.enqueued, enqueued); } +/* + * The clock_pelt scales the time to reflect the effective amount of + * computation done during the running delta time but then sync back to + * clock_task when rq is idle. + * + * + * absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16 + * @ max capacity ------******---------------******--------------- + * @ half capacity ------************---------************--------- + * clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16 + * + */ +static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) +{ + if (unlikely(is_idle_task(rq->curr))) { + /* The rq is idle, we can sync to clock_task */ + rq->clock_pelt = rq_clock_task(rq); + return; + } + + /* + * When a rq runs at a lower compute capacity, it will need + * more time to do the same amount of work than at max + * capacity. In order to be invariant, we scale the delta to + * reflect how much work has been really done. + * Running longer results in stealing idle time that will + * disturb the load signal compared to max capacity. This + * stolen idle time will be automatically reflected when the + * rq will be idle and the clock will be synced with + * rq_clock_task. + */ + + /* + * Scale the elapsed time to reflect the real amount of + * computation + */ + delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); + delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); + + rq->clock_pelt += delta; +} + +/* + * When rq becomes idle, we have to check if it has lost idle time + * because it was fully busy. A rq is fully used when the /Sum util_sum + * is greater or equal to: + * (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT; + * For optimization and computing rounding purpose, we don't take into account + * the position in the current window (period_contrib) and we use the higher + * bound of util_sum to decide. 
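The helpers just below close the loop on that scaling: while a rq runs below its maximum capability, clock_pelt lags clock_task, and the gap is normally turned into decay (idle time) when the rq goes idle and the clocks are re-synced. update_idle_rq_clock_pelt() catches the case where there was no real headroom: if the combined cfs+rt+dl util_sum shows the CPU was effectively always running, the would-be jump (rq_clock_task() - clock_pelt) is accumulated into lost_idle_time, and rq_clock_pelt() reports clock_pelt - lost_idle_time. Illustrative numbers: a CPU pinned at half its maximum frequency that is busy for 8ms of wall clock advances clock_pelt by 4ms; if it then idles while its util_sum indicates it was fully busy, the 4ms gap goes into lost_idle_time rather than being treated as 4ms of idle decay.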
+ */ +static inline void update_idle_rq_clock_pelt(struct rq *rq) +{ + u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX; + u32 util_sum = rq->cfs.avg.util_sum; + util_sum += rq->avg_rt.util_sum; + util_sum += rq->avg_dl.util_sum; + + /* + * Reflecting stolen time makes sense only if the idle + * phase would be present at max capacity. As soon as the + * utilization of a rq has reached the maximum value, it is + * considered as an always runnig rq without idle time to + * steal. This potential idle time is considered as lost in + * this case. We keep track of this lost idle time compare to + * rq's clock_task. + */ + if (util_sum >= divider) + rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt; +} + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + + return rq->clock_pelt - rq->lost_idle_time; +} + +#ifdef CONFIG_CFS_BANDWIDTH +/* rq->task_clock normalized against any time this cfs_rq has spent throttled */ +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + if (unlikely(cfs_rq->throttle_count)) + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; +} +#else +static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) +{ + return rq_clock_pelt(rq_of(cfs_rq)); +} +#endif + #else static inline int @@ -67,6 +163,18 @@ update_irq_load_avg(struct rq *rq, u64 running) { return 0; } + +static inline u64 rq_clock_pelt(struct rq *rq) +{ + return rq_clock_task(rq); +} + +static inline void +update_rq_clock_pelt(struct rq *rq, s64 delta) { } + +static inline void +update_idle_rq_clock_pelt(struct rq *rq) { } + #endif diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c3484785b179..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -4,6 +4,9 @@ * Copyright (c) 2018 Facebook, Inc. * Author: Johannes Weiner <hannes@cmpxchg.org> * + * Polling support by Suren Baghdasaryan <surenb@google.com> + * Copyright (c) 2018 Google, Inc. + * * When CPU, memory and IO are contended, tasks experience delays that * reduce throughput and introduce latencies into the workload. 
Memory * and IO contention, in addition, can cause a full loss of forward @@ -129,9 +132,13 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/seqlock.h> +#include <linux/uaccess.h> #include <linux/cgroup.h> #include <linux/module.h> #include <linux/sched.h> +#include <linux/ctype.h> +#include <linux/file.h> +#include <linux/poll.h> #include <linux/psi.h> #include "sched.h" @@ -140,9 +147,9 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED -bool psi_enable; +static bool psi_enable; #else -bool psi_enable = true; +static bool psi_enable = true; #endif static int __init setup_psi(char *str) { @@ -156,16 +163,21 @@ __setup("psi=", setup_psi); #define EXP_60s 1981 /* 1/exp(2s/60s) */ #define EXP_300s 2034 /* 1/exp(2s/300s) */ +/* PSI trigger definitions */ +#define WINDOW_MIN_US 500000 /* Min window size is 500ms */ +#define WINDOW_MAX_US 10000000 /* Max window size is 10s */ +#define UPDATES_PER_WINDOW 10 /* 10 updates per window */ + /* Sampling frequency in nanoseconds */ static u64 psi_period __read_mostly; /* System-level pressure and stall tracking */ static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); -static struct psi_group psi_system = { +struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static void psi_update_work(struct work_struct *work); +static void psi_avgs_work(struct work_struct *work); static void group_init(struct psi_group *group) { @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group) for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); - group->next_update = sched_clock() + psi_period; - INIT_DELAYED_WORK(&group->clock_work, psi_update_work); - mutex_init(&group->stat_lock); + group->avg_next_update = sched_clock() + psi_period; + INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); + mutex_init(&group->avgs_lock); + /* Init trigger-related members */ + atomic_set(&group->poll_scheduled, 0); + mutex_init(&group->trigger_lock); + INIT_LIST_HEAD(&group->triggers); + memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); + group->poll_states = 0; + group->poll_min_period = U32_MAX; + memset(group->polling_total, 0, sizeof(group->polling_total)); + group->polling_next_update = ULLONG_MAX; + group->polling_until = 0; + rcu_assign_pointer(group->poll_kworker, NULL); } void __init psi_init(void) @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static void get_recent_times(struct psi_group *group, int cpu, u32 *times) +static void get_recent_times(struct psi_group *group, int cpu, + enum psi_aggregators aggregator, u32 *times, + u32 *pchanged_states) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); - unsigned int tasks[NR_PSI_TASK_COUNTS]; u64 now, state_start; + enum psi_states s; unsigned int seq; - int s; + u32 state_mask; + + *pchanged_states = 0; /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(&groupc->seq); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); - memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_mask = groupc->state_mask; state_start = groupc->state_start; } while (read_seqcount_retry(&groupc->seq, seq)); @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) * (u32) and our reported pressure close to what's * actually happening. 
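
[Aside: the EXP_* constants above are per-2s decay factors, 1/exp(period/window), in fixed point. A quick standalone check of the two values visible in this hunk; the 2048 (11-bit) base is an assumption carried over from the load-average code, not something stated in the hunk itself.]

        #include <math.h>
        #include <stdio.h>

        int main(void)      /* build with -lm */
        {
                /* 2s sampling period, 1<<11 fixed-point base as in calc_load(). */
                printf("EXP_60s  ~ %.0f\n", 2048.0 * exp(-2.0 / 60.0));  /* 1981 */
                printf("EXP_300s ~ %.0f\n", 2048.0 * exp(-2.0 / 300.0)); /* 2034 */
                return 0;
        }
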
*/ - if (test_state(tasks, s)) + if (state_mask & (1 << s)) times[s] += now - state_start; - delta = times[s] - groupc->times_prev[s]; - groupc->times_prev[s] = times[s]; + delta = times[s] - groupc->times_prev[aggregator][s]; + groupc->times_prev[aggregator][s] = times[s]; times[s] = delta; + if (delta) + *pchanged_states |= (1 << s); } } @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, avg[2] = calc_load(avg[2], EXP_300s, pct); } -static bool update_stats(struct psi_group *group) +static void collect_percpu_times(struct psi_group *group, + enum psi_aggregators aggregator, + u32 *pchanged_states) { u64 deltas[NR_PSI_STATES - 1] = { 0, }; - unsigned long missed_periods = 0; unsigned long nonidle_total = 0; - u64 now, expires, period; + u32 changed_states = 0; int cpu; int s; - mutex_lock(&group->stat_lock); - /* * Collect the per-cpu time buckets and average them into a * single time sample that is normalized to wallclock time. @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group) for_each_possible_cpu(cpu) { u32 times[NR_PSI_STATES]; u32 nonidle; + u32 cpu_changed_states; - get_recent_times(group, cpu, times); + get_recent_times(group, cpu, aggregator, times, + &cpu_changed_states); + changed_states |= cpu_changed_states; nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); nonidle_total += nonidle; @@ -315,14 +346,23 @@ static bool update_stats(struct psi_group *group) /* total= */ for (s = 0; s < NR_PSI_STATES - 1; s++) - group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); + group->total[aggregator][s] += + div_u64(deltas[s], max(nonidle_total, 1UL)); + + if (pchanged_states) + *pchanged_states = changed_states; +} + +static u64 update_averages(struct psi_group *group, u64 now) +{ + unsigned long missed_periods = 0; + u64 expires, period; + u64 avg_next_update; + int s; /* avgX= */ - now = sched_clock(); - expires = group->next_update; - if (now < expires) - goto out; - if (now - expires > psi_period) + expires = group->avg_next_update; + if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); /* @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group) * But the deltas we sample out of the per-cpu buckets above * are based on the actual time elapsing between clock ticks. 
*/ - group->next_update = expires + ((1 + missed_periods) * psi_period); - period = now - (group->last_update + (missed_periods * psi_period)); - group->last_update = now; + avg_next_update = expires + ((1 + missed_periods) * psi_period); + period = now - (group->avg_last_update + (missed_periods * psi_period)); + group->avg_last_update = now; for (s = 0; s < NR_PSI_STATES - 1; s++) { u32 sample; - sample = group->total[s] - group->total_prev[s]; + sample = group->total[PSI_AVGS][s] - group->avg_total[s]; /* * Due to the lockless sampling of the time buckets, * recorded time deltas can slip into the next period, @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group) */ if (sample > period) sample = period; - group->total_prev[s] += sample; + group->avg_total[s] += sample; calc_avgs(group->avg[s], missed_periods, sample, period); } -out: - mutex_unlock(&group->stat_lock); - return nonidle_total; + + return avg_next_update; } -static void psi_update_work(struct work_struct *work) +static void psi_avgs_work(struct work_struct *work) { struct delayed_work *dwork; struct psi_group *group; + u32 changed_states; bool nonidle; + u64 now; dwork = to_delayed_work(work); - group = container_of(dwork, struct psi_group, clock_work); + group = container_of(dwork, struct psi_group, avgs_work); + + mutex_lock(&group->avgs_lock); + now = sched_clock(); + + collect_percpu_times(group, PSI_AVGS, &changed_states); + nonidle = changed_states & (1 << PSI_NONIDLE); /* * If there is task activity, periodically fold the per-cpu * times and feed samples into the running averages. If things @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work) * Once restarted, we'll catch up the running averages in one * go - see calc_avgs() and missed_periods. */ - - nonidle = update_stats(group); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); if (nonidle) { - unsigned long delay = 0; - u64 now; + schedule_delayed_work(dwork, nsecs_to_jiffies( + group->avg_next_update - now) + 1); + } + + mutex_unlock(&group->avgs_lock); +} + +/* Trigger tracking window manupulations */ +static void window_reset(struct psi_window *win, u64 now, u64 value, + u64 prev_growth) +{ + win->start_time = now; + win->start_value = value; + win->prev_growth = prev_growth; +} + +/* + * PSI growth tracking window update and growth calculation routine. + * + * This approximates a sliding tracking window by interpolating + * partially elapsed windows using historical growth data from the + * previous intervals. This minimizes memory requirements (by not storing + * all the intermediate values in the previous window) and simplifies + * the calculations. It works well because PSI signal changes only in + * positive direction and over relatively small window sizes the growth + * is close to linear. + */ +static u64 window_update(struct psi_window *win, u64 now, u64 value) +{ + u64 elapsed; + u64 growth; + + elapsed = now - win->start_time; + growth = value - win->start_value; + /* + * After each tracking window passes win->start_value and + * win->start_time get reset and win->prev_growth stores + * the average per-window growth of the previous window. + * win->prev_growth is then used to interpolate additional + * growth from the previous window assuming it was linear. 
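
[Aside: the interpolation described above reports the growth seen so far in the current window plus a linear share of the previous window's growth. A small userspace restatement of window_update() with simplified types; it mirrors the kernel code rather than replacing it.]

        #include <stdint.h>

        struct win {
                uint64_t start_time, start_value, prev_growth, size;
        };

        /* Growth estimate over a sliding window, interpolated as described above. */
        static uint64_t win_growth(struct win *w, uint64_t now, uint64_t value)
        {
                uint64_t elapsed = now - w->start_time;
                uint64_t growth = value - w->start_value;

                if (elapsed > w->size) {
                        /* Window passed: restart tracking, remember last growth. */
                        w->start_time = now;
                        w->start_value = value;
                        w->prev_growth = growth;
                } else {
                        /* Borrow the remaining fraction of the previous window. */
                        uint64_t remaining = w->size - elapsed;

                        growth += w->prev_growth * remaining / w->size;
                }
                return growth;
        }

For example, 400 ms into a 1 s window, 30 ms of new stall plus 100 ms of growth in the previous window yields 30 + 100*600/1000 = 90 ms.
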
+ */ + if (elapsed > win->size) + window_reset(win, now, value, growth); + else { + u32 remaining; + + remaining = win->size - elapsed; + growth += div_u64(win->prev_growth * remaining, win->size); + } + + return growth; +} + +static void init_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + + list_for_each_entry(t, &group->triggers, node) + window_reset(&t->win, now, + group->total[PSI_POLL][t->state], 0); + memcpy(group->polling_total, group->total[PSI_POLL], + sizeof(group->polling_total)); + group->polling_next_update = now + group->poll_min_period; +} + +static u64 update_triggers(struct psi_group *group, u64 now) +{ + struct psi_trigger *t; + bool new_stall = false; + u64 *total = group->total[PSI_POLL]; + + /* + * On subsequent updates, calculate growth deltas and let + * watchers know when their specified thresholds are exceeded. + */ + list_for_each_entry(t, &group->triggers, node) { + u64 growth; + + /* Check for stall activity */ + if (group->polling_total[t->state] == total[t->state]) + continue; + + /* + * Multiple triggers might be looking at the same state, + * remember to update group->polling_total[] once we've + * been through all of them. Also remember to extend the + * polling time if we see new stall activity. + */ + new_stall = true; + + /* Calculate growth since last update */ + growth = window_update(&t->win, now, total[t->state]); + if (growth < t->threshold) + continue; + + /* Limit event signaling to once per window */ + if (now < t->last_event_time + t->win.size) + continue; + + /* Generate an event */ + if (cmpxchg(&t->event, 0, 1) == 0) + wake_up_interruptible(&t->event_wait); + t->last_event_time = now; + } + + if (new_stall) + memcpy(group->polling_total, total, + sizeof(group->polling_total)); + + return now + group->poll_min_period; +} + +/* + * Schedule polling if it's not already scheduled. It's safe to call even from + * hotpath because even though kthread_queue_delayed_work takes worker->lock + * spinlock that spinlock is never contended due to poll_scheduled atomic + * preventing such competition. 
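
[Aside: the poll_scheduled guard described above is a generic "schedule at most once" pattern: only the caller that flips 0 -> 1 queues the work, and the worker clears the flag before running so new activity can re-arm it. A minimal sketch with C11 atomics; queue_poll_work() is a stand-in for the kthread worker used by the kernel code.]

        #include <stdatomic.h>

        static atomic_int poll_scheduled;

        /* Stand-in for kthread_queue_delayed_work() in the code above. */
        static void queue_poll_work(void) { }

        static void schedule_poll(void)
        {
                int expected = 0;

                /* Only the caller that wins the 0 -> 1 transition queues work. */
                if (!atomic_compare_exchange_strong(&poll_scheduled, &expected, 1))
                        return;
                queue_poll_work();
        }

        static void poll_worker(void)
        {
                /*
                 * Clear the flag first, like psi_poll_work(), so a state change
                 * that happens while we run can schedule the next pass.
                 */
                atomic_store(&poll_scheduled, 0);
                /* ... collect samples, evaluate triggers ... */
        }
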
+ */ +static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) +{ + struct kthread_worker *kworker; + + /* Do not reschedule if already scheduled */ + if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) + return; + + rcu_read_lock(); - now = sched_clock(); - if (group->next_update > now) - delay = nsecs_to_jiffies(group->next_update - now) + 1; - schedule_delayed_work(dwork, delay); + kworker = rcu_dereference(group->poll_kworker); + /* + * kworker might be NULL in case psi_trigger_destroy races with + * psi_task_change (hotpath) which can't use locks + */ + if (likely(kworker)) + kthread_queue_delayed_work(kworker, &group->poll_work, delay); + else + atomic_set(&group->poll_scheduled, 0); + + rcu_read_unlock(); +} + +static void psi_poll_work(struct kthread_work *work) +{ + struct kthread_delayed_work *dwork; + struct psi_group *group; + u32 changed_states; + u64 now; + + dwork = container_of(work, struct kthread_delayed_work, work); + group = container_of(dwork, struct psi_group, poll_work); + + atomic_set(&group->poll_scheduled, 0); + + mutex_lock(&group->trigger_lock); + + now = sched_clock(); + + collect_percpu_times(group, PSI_POLL, &changed_states); + + if (changed_states & group->poll_states) { + /* Initialize trigger windows when entering polling mode */ + if (now > group->polling_until) + init_triggers(group, now); + + /* + * Keep the monitor active for at least the duration of the + * minimum tracking window as long as monitor states are + * changing. + */ + group->polling_until = now + + group->poll_min_period * UPDATES_PER_WINDOW; + } + + if (now > group->polling_until) { + group->polling_next_update = ULLONG_MAX; + goto out; } + + if (now >= group->polling_next_update) + group->polling_next_update = update_triggers(group, now); + + psi_schedule_poll_work(group, + nsecs_to_jiffies(group->polling_next_update - now) + 1); + +out: + mutex_unlock(&group->trigger_lock); } static void record_times(struct psi_group_cpu *groupc, int cpu, @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, delta = now - groupc->state_start; groupc->state_start = now; - if (test_state(groupc->tasks, PSI_IO_SOME)) { + if (groupc->state_mask & (1 << PSI_IO_SOME)) { groupc->times[PSI_IO_SOME] += delta; - if (test_state(groupc->tasks, PSI_IO_FULL)) + if (groupc->state_mask & (1 << PSI_IO_FULL)) groupc->times[PSI_IO_FULL] += delta; } - if (test_state(groupc->tasks, PSI_MEM_SOME)) { + if (groupc->state_mask & (1 << PSI_MEM_SOME)) { groupc->times[PSI_MEM_SOME] += delta; - if (test_state(groupc->tasks, PSI_MEM_FULL)) + if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; else if (memstall_tick) { u32 sample; @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (test_state(groupc->tasks, PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) groupc->times[PSI_CPU_SOME] += delta; - if (test_state(groupc->tasks, PSI_NONIDLE)) + if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; } -static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set) +static u32 psi_group_change(struct psi_group *group, int cpu, + unsigned int clear, unsigned int set) { struct psi_group_cpu *groupc; unsigned int t, m; + enum psi_states s; + u32 state_mask = 0; groupc = per_cpu_ptr(group->pcpu, cpu); @@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; + /* 
Calculate state mask representing active states */ + for (s = 0; s < NR_PSI_STATES; s++) { + if (test_state(groupc->tasks, s)) + state_mask |= (1 << s); + } + groupc->state_mask = state_mask; + write_seqcount_end(&groupc->seq); + + return state_mask; } static struct psi_group *iterate_groups(struct task_struct *task, void **iter) @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) */ if (unlikely((clear & TSK_RUNNING) && (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_update_work)) + wq_worker_last_func(task) == psi_avgs_work)) wake_clock = false; while ((group = iterate_groups(task, &iter))) { - psi_group_change(group, cpu, clear, set); - if (wake_clock && !delayed_work_pending(&group->clock_work)) - schedule_delayed_work(&group->clock_work, PSI_FREQ); + u32 state_mask = psi_group_change(group, cpu, clear, set); + + if (state_mask & group->poll_states) + psi_schedule_poll_work(group, 1); + + if (wake_clock && !delayed_work_pending(&group->avgs_work)) + schedule_delayed_work(&group->avgs_work, PSI_FREQ); } } @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup) if (static_branch_likely(&psi_disabled)) return; - cancel_delayed_work_sync(&cgroup->psi.clock_work); + cancel_delayed_work_sync(&cgroup->psi.avgs_work); free_percpu(cgroup->psi.pcpu); + /* All triggers must be removed by now */ + WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); } /** @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { int full; + u64 now; if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; - update_stats(group); + /* Update averages before reporting them */ + mutex_lock(&group->avgs_lock); + now = sched_clock(); + collect_percpu_times(group, PSI_AVGS, NULL); + if (now >= group->avg_next_update) + group->avg_next_update = update_averages(group, now); + mutex_unlock(&group->avgs_lock); for (full = 0; full < 2 - (res == PSI_CPU); full++) { unsigned long avg[3]; @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) for (w = 0; w < 3; w++) avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? 
"full" : "some", @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) return single_open(file, psi_cpu_show, NULL); } +struct psi_trigger *psi_trigger_create(struct psi_group *group, + char *buf, size_t nbytes, enum psi_res res) +{ + struct psi_trigger *t; + enum psi_states state; + u32 threshold_us; + u32 window_us; + + if (static_branch_likely(&psi_disabled)) + return ERR_PTR(-EOPNOTSUPP); + + if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_SOME + res * 2; + else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) + state = PSI_IO_FULL + res * 2; + else + return ERR_PTR(-EINVAL); + + if (state >= PSI_NONIDLE) + return ERR_PTR(-EINVAL); + + if (window_us < WINDOW_MIN_US || + window_us > WINDOW_MAX_US) + return ERR_PTR(-EINVAL); + + /* Check threshold */ + if (threshold_us == 0 || threshold_us > window_us) + return ERR_PTR(-EINVAL); + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return ERR_PTR(-ENOMEM); + + t->group = group; + t->state = state; + t->threshold = threshold_us * NSEC_PER_USEC; + t->win.size = window_us * NSEC_PER_USEC; + window_reset(&t->win, 0, 0, 0); + + t->event = 0; + t->last_event_time = 0; + init_waitqueue_head(&t->event_wait); + kref_init(&t->refcount); + + mutex_lock(&group->trigger_lock); + + if (!rcu_access_pointer(group->poll_kworker)) { + struct sched_param param = { + .sched_priority = MAX_RT_PRIO - 1, + }; + struct kthread_worker *kworker; + + kworker = kthread_create_worker(0, "psimon"); + if (IS_ERR(kworker)) { + kfree(t); + mutex_unlock(&group->trigger_lock); + return ERR_CAST(kworker); + } + sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); + kthread_init_delayed_work(&group->poll_work, + psi_poll_work); + rcu_assign_pointer(group->poll_kworker, kworker); + } + + list_add(&t->node, &group->triggers); + group->poll_min_period = min(group->poll_min_period, + div_u64(t->win.size, UPDATES_PER_WINDOW)); + group->nr_triggers[t->state]++; + group->poll_states |= (1 << t->state); + + mutex_unlock(&group->trigger_lock); + + return t; +} + +static void psi_trigger_destroy(struct kref *ref) +{ + struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); + struct psi_group *group = t->group; + struct kthread_worker *kworker_to_destroy = NULL; + + if (static_branch_likely(&psi_disabled)) + return; + + /* + * Wakeup waiters to stop polling. Can happen if cgroup is deleted + * from under a polling process. 
+ */ + wake_up_interruptible(&t->event_wait); + + mutex_lock(&group->trigger_lock); + + if (!list_empty(&t->node)) { + struct psi_trigger *tmp; + u64 period = ULLONG_MAX; + + list_del(&t->node); + group->nr_triggers[t->state]--; + if (!group->nr_triggers[t->state]) + group->poll_states &= ~(1 << t->state); + /* reset min update period for the remaining triggers */ + list_for_each_entry(tmp, &group->triggers, node) + period = min(period, div_u64(tmp->win.size, + UPDATES_PER_WINDOW)); + group->poll_min_period = period; + /* Destroy poll_kworker when the last trigger is destroyed */ + if (group->poll_states == 0) { + group->polling_until = 0; + kworker_to_destroy = rcu_dereference_protected( + group->poll_kworker, + lockdep_is_held(&group->trigger_lock)); + rcu_assign_pointer(group->poll_kworker, NULL); + } + } + + mutex_unlock(&group->trigger_lock); + + /* + * Wait for both *trigger_ptr from psi_trigger_replace and + * poll_kworker RCUs to complete their read-side critical sections + * before destroying the trigger and optionally the poll_kworker + */ + synchronize_rcu(); + /* + * Destroy the kworker after releasing trigger_lock to prevent a + * deadlock while waiting for psi_poll_work to acquire trigger_lock + */ + if (kworker_to_destroy) { + kthread_cancel_delayed_work_sync(&group->poll_work); + kthread_destroy_worker(kworker_to_destroy); + } + kfree(t); +} + +void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) +{ + struct psi_trigger *old = *trigger_ptr; + + if (static_branch_likely(&psi_disabled)) + return; + + rcu_assign_pointer(*trigger_ptr, new); + if (old) + kref_put(&old->refcount, psi_trigger_destroy); +} + +__poll_t psi_trigger_poll(void **trigger_ptr, + struct file *file, poll_table *wait) +{ + __poll_t ret = DEFAULT_POLLMASK; + struct psi_trigger *t; + + if (static_branch_likely(&psi_disabled)) + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + + rcu_read_lock(); + + t = rcu_dereference(*(void __rcu __force **)trigger_ptr); + if (!t) { + rcu_read_unlock(); + return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; + } + kref_get(&t->refcount); + + rcu_read_unlock(); + + poll_wait(file, &t->event_wait, wait); + + if (cmpxchg(&t->event, 1, 0) == 1) + ret |= EPOLLPRI; + + kref_put(&t->refcount, psi_trigger_destroy); + + return ret; +} + +static ssize_t psi_write(struct file *file, const char __user *user_buf, + size_t nbytes, enum psi_res res) +{ + char buf[32]; + size_t buf_size; + struct seq_file *seq; + struct psi_trigger *new; + + if (static_branch_likely(&psi_disabled)) + return -EOPNOTSUPP; + + buf_size = min(nbytes, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size - 1] = '\0'; + + new = psi_trigger_create(&psi_system, buf, nbytes, res); + if (IS_ERR(new)) + return PTR_ERR(new); + + seq = file->private_data; + /* Take seq->lock to protect seq->private from concurrent writes */ + mutex_lock(&seq->lock); + psi_trigger_replace(&seq->private, new); + mutex_unlock(&seq->lock); + + return nbytes; +} + +static ssize_t psi_io_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IO); +} + +static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_MEM); +} + +static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_CPU); +} + +static __poll_t 
psi_fop_poll(struct file *file, poll_table *wait) +{ + struct seq_file *seq = file->private_data; + + return psi_trigger_poll(&seq->private, file, wait); +} + +static int psi_fop_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + psi_trigger_replace(&seq->private, NULL); + return single_release(inode, file); +} + static const struct file_operations psi_io_fops = { .open = psi_io_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_io_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_memory_fops = { .open = psi_memory_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_memory_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static const struct file_operations psi_cpu_fops = { .open = psi_cpu_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .write = psi_cpu_write, + .poll = psi_fop_poll, + .release = psi_fop_release, }; static int __init psi_proc_init(void) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e4f398ad9e73..a532558a5176 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1587,7 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * rt task */ if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_task(rq), rq, 0); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; } @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); /* * The previous task needs to be made eligible for pushing @@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, &p->cpus_allowed)) + cpumask_test_cpu(cpu, p->cpus_ptr)) return 1; return 0; @@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. 
*/ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { @@ -2325,7 +2325,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - update_rt_rq_load_avg(rq_clock_task(rq), rq, 1); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); watchdog(rq, p); @@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = { .switched_to = switched_to_rt, .update_curr = update_curr_rt, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED @@ -2555,6 +2559,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; + else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } @@ -2575,6 +2581,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us) { u64 rt_runtime, rt_period; + if (rt_period_us > U64_MAX / NSEC_PER_USEC) + return -EINVAL; + rt_period = rt_period_us * NSEC_PER_USEC; rt_runtime = tg->rt_bandwidth.rt_runtime; diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index a26473674fb7..c529706bed11 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ -static const u32 runnable_avg_yN_inv[] = { +static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d04530bf251f..802b1f3405f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); -#ifdef CONFIG_SMP -extern void cpu_load_update_active(struct rq *this_rq); -#else -static inline void cpu_load_update_active(struct rq *this_rq) { } -#endif - /* * Helpers for converting nanosecond timing to jiffy resolution */ @@ -344,8 +338,10 @@ struct cfs_bandwidth { u64 runtime_expires; int expires_seq; - short idle; - short period_active; + u8 idle; + u8 period_active; + u8 distribute_running; + u8 slack_started; struct hrtimer period_timer; struct hrtimer slack_timer; struct list_head throttled_cfs_rq; @@ -354,8 +350,6 @@ struct cfs_bandwidth { int nr_periods; int nr_throttled; u64 throttled_time; - - bool distribute_running; #endif }; @@ -780,7 +774,7 @@ struct root_domain { * NULL-terminated list of performance domains intersecting with the * CPUs of the rd. Protected by RCU. 
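
[Aside on the rt.c bounds checks above: they guard the us -> ns conversion against 64-bit overflow by comparing against U64_MAX / NSEC_PER_USEC before multiplying. The same idiom in standalone form; the function name is illustrative.]

        #include <errno.h>
        #include <stdint.h>

        #define NSEC_PER_USEC   1000ULL

        /* Convert microseconds to nanoseconds, rejecting values that would wrap. */
        static int us_to_ns_checked(uint64_t us, uint64_t *ns)
        {
                if (us > UINT64_MAX / NSEC_PER_USEC)
                        return -EINVAL;
                *ns = us * NSEC_PER_USEC;
                return 0;
        }
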
*/ - struct perf_domain *pd; + struct perf_domain __rcu *pd; }; extern struct root_domain def_root_domain; @@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work); #endif #endif /* CONFIG_SMP */ +#ifdef CONFIG_UCLAMP_TASK +/* + * struct uclamp_bucket - Utilization clamp bucket + * @value: utilization clamp value for tasks on this clamp bucket + * @tasks: number of RUNNABLE tasks on this clamp bucket + * + * Keep track of how many tasks are RUNNABLE for a given utilization + * clamp value. + */ +struct uclamp_bucket { + unsigned long value : bits_per(SCHED_CAPACITY_SCALE); + unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); +}; + +/* + * struct uclamp_rq - rq's utilization clamp + * @value: currently active clamp values for a rq + * @bucket: utilization clamp buckets affecting a rq + * + * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. + * A clamp value is affecting a rq when there is at least one task RUNNABLE + * (or actually running) with that value. + * + * There are up to UCLAMP_CNT possible different clamp values, currently there + * are only two: minimum utilization and maximum utilization. + * + * All utilization clamping values are MAX aggregated, since: + * - for util_min: we want to run the CPU at least at the max of the minimum + * utilization required by its currently RUNNABLE tasks. + * - for util_max: we want to allow the CPU to run up to the max of the + * maximum utilization allowed by its currently RUNNABLE tasks. + * + * Since on each system we expect only a limited number of different + * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track + * the metrics required to compute all the per-rq utilization clamp values. + */ +struct uclamp_rq { + unsigned int value; + struct uclamp_bucket bucket[UCLAMP_BUCKETS]; +}; +#endif /* CONFIG_UCLAMP_TASK */ + /* * This is the main, per-CPU runqueue data structure. 
* @@ -818,8 +854,6 @@ struct rq { unsigned int nr_preferred_running; unsigned int numa_migrate_on; #endif - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_SMP unsigned long last_load_update_tick; @@ -830,11 +864,16 @@ struct rq { atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ - /* capture load from *all* tasks on this CPU: */ - struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; +#ifdef CONFIG_UCLAMP_TASK + /* Utilization clamp values based on CPU's RUNNABLE tasks */ + struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; + unsigned int uclamp_flags; +#define UCLAMP_FLAG_IDLE 0x01 +#endif + struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; @@ -861,13 +900,16 @@ struct rq { unsigned int clock_update_flags; u64 clock; - u64 clock_task; + /* Ensure that all clocks are in the same cache line */ + u64 clock_task ____cacheline_aligned; + u64 clock_pelt; + unsigned long lost_idle_time; atomic_t nr_iowait; #ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; + struct root_domain *rd; + struct sched_domain __rcu *sd; unsigned long cpu_capacity; unsigned long cpu_capacity_orig; @@ -951,6 +993,22 @@ struct rq { #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* CPU runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +#else + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} +#endif + static inline int cpu_of(struct rq *rq) { #ifdef CONFIG_SMP @@ -1260,7 +1318,7 @@ extern void sched_ttwu_pending(void); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. + * See destroy_sched_domains: call_rcu for details. * * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. 
@@ -1305,13 +1363,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) return sd; } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { @@ -1460,9 +1518,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); #ifdef CONFIG_THREAD_INFO_IN_TASK - p->cpu = cpu; + WRITE_ONCE(p->cpu, cpu); #else - task_thread_info(p)->cpu = cpu; + WRITE_ONCE(task_thread_info(p)->cpu, cpu); #endif p->wake_cpu = cpu; #endif @@ -1563,7 +1621,7 @@ static inline int task_on_rq_queued(struct task_struct *p) static inline int task_on_rq_migrating(struct task_struct *p) { - return p->on_rq == TASK_ON_RQ_MIGRATING; + return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } /* @@ -1630,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40]; struct sched_class { const struct sched_class *next; +#ifdef CONFIG_UCLAMP_TASK + int uclamp_enabled; +#endif + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); @@ -1781,7 +1843,7 @@ extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); -extern void post_init_entity_util_avg(struct sched_entity *se); +extern void post_init_entity_util_avg(struct task_struct *p); #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(struct rq *rq); @@ -2166,7 +2228,7 @@ static inline u64 irq_time_read(int cpu) #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); /** * cpufreq_update_util - Take a note about CPU utilization changes. @@ -2203,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_UCLAMP_TASK +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); + +static __always_inline +unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); + unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); + + if (p) { + min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); + max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); + } + + /* + * Since CPU's {min,max}_util clamps are MAX aggregated considering + * RUNNABLE tasks with _different_ clamps, we can end up with an + * inversion. Fix it now when the clamps are applied. 
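
[Aside: the clamping rule described in uclamp_util_with() reduces to an ordinary clamp with one twist: if MAX-aggregation produced min >= max, the boosted minimum wins. A small standalone restatement with example values, where 1024 stands for full capacity.]

        #include <stdio.h>

        static unsigned int clamp_util(unsigned int util,
                                       unsigned int min_util, unsigned int max_util)
        {
                /* Inverted clamps (min >= max) degrade to the boosted minimum. */
                if (min_util >= max_util)
                        return min_util;
                if (util < min_util)
                        return min_util;
                if (util > max_util)
                        return max_util;
                return util;
        }

        int main(void)
        {
                printf("%u\n", clamp_util(100, 512, 1024)); /* boosted to 512 */
                printf("%u\n", clamp_util(900,   0,  512)); /* capped at 512  */
                printf("%u\n", clamp_util(100, 800,  600)); /* inversion -> 800 */
                return 0;
        }
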
+ */ + if (unlikely(min_util >= max_util)) + return min_util; + + return clamp(util, min_util, max_util); +} + +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return uclamp_util_with(rq, util, NULL); +} +#else /* CONFIG_UCLAMP_TASK */ +static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, + struct task_struct *p) +{ + return util; +} +static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) +{ + return util; +} +#endif /* CONFIG_UCLAMP_TASK */ + #ifdef arch_scale_freq_capacity # ifndef arch_scale_freq_invariant # define arch_scale_freq_invariant() true @@ -2211,7 +2315,13 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +#ifdef CONFIG_SMP +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + /** * enum schedutil_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -2227,15 +2337,11 @@ enum schedutil_type { ENERGY_UTIL, }; -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type); - -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) -{ - unsigned long max = arch_scale_cpu_capacity(NULL, cpu); +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); -} +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) { @@ -2264,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) { - return cfs; + return 0; } -#endif +#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) @@ -2299,11 +2407,19 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #endif #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -#else + +DECLARE_STATIC_KEY_FALSE(sched_energy_present); + +static inline bool sched_energy_enabled(void) +{ + return static_branch_unlikely(&sched_energy_present); +} + +#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ + #define perf_domain_span(pd) NULL -#endif +static inline bool sched_energy_enabled(void) { return false; } -#ifdef CONFIG_SMP -extern struct static_key_false sched_energy_present; -#endif +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 3f35ba1d8fde..f751ce0b783e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,11 +201,37 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_STATIC_KEY_FALSE(sched_energy_present); +unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + state = static_branch_unlikely(&sched_energy_present); + if (state != sysctl_sched_energy_aware) { + mutex_lock(&sched_energy_mutex); + sched_energy_update = 1; + rebuild_sched_domains(); + sched_energy_update = 0; + mutex_unlock(&sched_energy_mutex); + } + } + + return ret; +} +#endif + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -322,6 +348,9 @@ static bool build_perf_domains(const struct cpumask *cpu_map) struct cpufreq_policy *policy; struct cpufreq_governor *gov; + if (!sysctl_sched_energy_aware) + goto free; + /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { @@ -442,7 +471,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) raw_spin_unlock_irqrestore(&rq->lock, flags); if (old_rd) - call_rcu_sched(&old_rd->rcu, free_rootdomain); + call_rcu(&old_rd->rcu, free_rootdomain); } void sched_get_rd(struct root_domain *rd) @@ -455,7 +484,7 @@ void sched_put_rd(struct root_domain *rd) if (!atomic_dec_and_test(&rd->refcount)) return; - call_rcu_sched(&rd->rcu, free_rootdomain); + call_rcu(&rd->rcu, free_rootdomain); } static int init_rootdomain(struct root_domain *rd) @@ -586,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd) * the cpumask of the domain), this allows us to quickly tell if * two CPUs are in the same cache domain, see cpus_share_cache(). 
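
[Aside: sysctl_sched_energy_aware above is toggled from userspace; the sysctl registration is not part of this hunk, so the /proc path below is an assumption (kernel.sched_energy_aware). Writing a different value makes the handler rebuild the sched domains with or without the energy model.]

        #include <fcntl.h>
        #include <unistd.h>

        /* Enable/disable EAS at runtime; path assumes kernel.sched_energy_aware. */
        static int set_sched_energy_aware(int enable)
        {
                int fd = open("/proc/sys/kernel/sched_energy_aware", O_WRONLY);
                char val = enable ? '1' : '0';

                if (fd < 0)
                        return -1;
                if (write(fd, &val, 1) != 1) {
                        close(fd);
                        return -1;
                }
                return close(fd);
        }
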
*/ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) @@ -676,7 +705,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; @@ -1030,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); struct sched_domain *child = sd->child; struct sched_group *sg; + bool already_visited; if (child) cpu = cpumask_first(sched_domain_span(child)); @@ -1037,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg = *per_cpu_ptr(sdd->sg, cpu); sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); - /* For claim_allocations: */ - atomic_inc(&sg->ref); - atomic_inc(&sg->sgc->ref); + /* Increase refcounts for claim_allocations: */ + already_visited = atomic_inc_return(&sg->ref) > 1; + /* sgc visits should follow a similar trend as sg */ + WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + + /* If we have already visited that group, it's already initialized. */ + if (already_visited) + return sg; if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); @@ -1058,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) /* * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc. * * Assumes the sched_domain tree is fully constructed */ @@ -1309,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl, .imbalance_pct = 125, .cache_nice_tries = 0, - .busy_idx = 0, - .idle_idx = 0, - .newidle_idx = 0, - .wake_idx = 0, - .forkexec_idx = 0, .flags = 1*SD_LOAD_BALANCE | 1*SD_BALANCE_NEWIDLE @@ -1365,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl, } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { sd->imbalance_pct = 117; sd->cache_nice_tries = 1; - sd->busy_idx = 2; #ifdef CONFIG_NUMA } else if (sd->flags & SD_NUMA) { sd->cache_nice_tries = 2; - sd->busy_idx = 3; - sd->idle_idx = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; @@ -1384,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { sd->cache_nice_tries = 1; - sd->busy_idx = 2; - sd->idle_idx = 1; } /* @@ -1849,10 +1874,10 @@ static struct sched_domain_topology_level unsigned long cap; /* Is there any asymmetry? 
*/ - cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); for_each_cpu(i, cpu_map) { - if (arch_scale_cpu_capacity(NULL, i) != cap) { + if (arch_scale_cpu_capacity(i) != cap) { asym = true; break; } @@ -1867,7 +1892,7 @@ static struct sched_domain_topology_level * to everyone. */ for_each_cpu(i, cpu_map) { - unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + unsigned long max_capacity = arch_scale_cpu_capacity(i); int tl_id = 0; for_each_sd_topology(tl) { @@ -1877,7 +1902,7 @@ static struct sched_domain_topology_level for_each_cpu_and(j, tl->mask(i), cpu_map) { unsigned long capacity; - capacity = arch_scale_cpu_capacity(NULL, j); + capacity = arch_scale_cpu_capacity(j); if (capacity <= max_capacity) continue; @@ -2046,9 +2071,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) } /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups. For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future. */ int sched_init_domains(const struct cpumask *cpu_map) { diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6eb1f8efd221..c1e566a114ca 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic waiting primitives. * @@ -117,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int bookmark.func = NULL; INIT_LIST_HEAD(&bookmark.entry); - spin_lock_irqsave(&wq_head->lock, flags); - nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - - while (bookmark.flags & WQ_FLAG_BOOKMARK) { + do { spin_lock_irqsave(&wq_head->lock, flags); nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); spin_unlock_irqrestore(&wq_head->lock, flags); - } + } while (bookmark.flags & WQ_FLAG_BOOKMARK); } /** diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c67c6d24adc2..45eba18a2898 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The implementation of the wait_bit*() and related waiting APIs: */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e815781ed751..dba52a7db5e8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -148,8 +148,8 @@ static void populate_seccomp_data(struct seccomp_data *sd) unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); - sd->arch = syscall_get_arch(); - syscall_get_arguments(task, regs, 0, 6, args); + sd->arch = syscall_get_arch(task); + syscall_get_arguments(task, regs, args); sd->args[0] = args[0]; sd->args[1] = args[1]; sd->args[2] = args[2]; @@ -267,6 +267,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ + preempt_disable(); for (; f; f = f->prev) { u32 cur_ret = BPF_PROG_RUN(f->prog, sd); @@ -275,6 +276,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, *match = f; } } + preempt_enable(); return ret; } #endif /* CONFIG_SECCOMP_FILTER */ @@ -329,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent, * Expects sighand and cred_guard_mutex locks to be held. 
* * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) @@ -443,8 +445,8 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. */ if (!task_no_new_privs(current) && - security_capable_noaudit(current_cred(), current_user_ns(), - CAP_SYS_ADMIN) != 0) + security_capable(current_cred(), current_user_ns(), + CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ @@ -500,7 +502,10 @@ out: * * Caller must be holding current->sighand->siglock lock. * - * Returns 0 on success, -ve on error. + * Returns 0 on success, -ve on error, or + * - in TSYNC mode: the pid of a thread which was either not in the correct + * seccomp mode or did not have an ancestral seccomp filter + * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, struct seccomp_filter *filter) @@ -589,7 +594,7 @@ static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); info->si_errno = reason; - info->si_arch = syscall_get_arch(); + info->si_arch = syscall_get_arch(current); info->si_syscall = syscall; } @@ -604,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason) { struct kernel_siginfo info; seccomp_init_siginfo(&info, syscall, reason); - force_sig_info(SIGSYS, &info, current); + force_sig_info(&info); } #endif /* CONFIG_SECCOMP_FILTER */ @@ -1256,6 +1261,16 @@ static long seccomp_set_mode_filter(unsigned int flags, if (flags & ~SECCOMP_FILTER_FLAG_MASK) return -EINVAL; + /* + * In the successful case, NEW_LISTENER returns the new listener fd. + * But in the failure case, TSYNC returns the thread that died. If you + * combine these two flags, there's no way to tell whether something + * succeeded or failed. So, let's disallow this combination. + */ + if ((flags & SECCOMP_FILTER_FLAG_TSYNC) && + (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER)) + return -EINVAL; + /* Prepare the new filter before holding any locks. */ prepared = seccomp_prepare_user_filter(filter); if (IS_ERR(prepared)) @@ -1302,7 +1317,7 @@ out: mutex_unlock(¤t->signal->cred_guard_mutex); out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { - if (ret < 0) { + if (ret) { listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); diff --git a/kernel/signal.c b/kernel/signal.c index 99fa8ff06fd9..91b789dd6e72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/signal.c * @@ -19,7 +20,9 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/file.h> #include <linux/fs.h> +#include <linux/proc_fs.h> #include <linux/tty.h> #include <linux/binfmts.h> #include <linux/coredump.h> @@ -41,6 +44,8 @@ #include <linux/compiler.h> #include <linux/posix-timers.h> #include <linux/livepatch.h> +#include <linux/cgroup.h> +#include <linux/audit.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -50,7 +55,6 @@ #include <asm/unistd.h> #include <asm/siginfo.h> #include <asm/cacheflush.h> -#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. 
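
[Aside: the flag conflict disallowed above is visible from userspace: a filter install asking for both TSYNC and NEW_LISTENER now fails with EINVAL instead of returning an ambiguous value. A hedged sketch of such a call (allow-all filter, minimal error handling, assuming uapi headers new enough to define SECCOMP_FILTER_FLAG_NEW_LISTENER).]

        #include <linux/filter.h>
        #include <linux/seccomp.h>
        #include <stdio.h>
        #include <sys/prctl.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        int main(void)
        {
                struct sock_filter allow[] = {
                        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
                };
                struct sock_fprog prog = {
                        .len = sizeof(allow) / sizeof(allow[0]),
                        .filter = allow,
                };
                unsigned int flags = SECCOMP_FILTER_FLAG_TSYNC |
                                     SECCOMP_FILTER_FLAG_NEW_LISTENER;
                long ret;

                prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
                ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, flags, &prog);
                if (ret < 0)
                        perror("seccomp"); /* expected: EINVAL after this change */
                return 0;
        }
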
@@ -144,9 +148,10 @@ static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) static bool recalc_sigpending_tsk(struct task_struct *t) { - if ((t->jobctl & JOBCTL_PENDING_MASK) || + if ((t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) || PENDING(&t->pending, &t->blocked) || - PENDING(&t->signal->shared_pending, &t->blocked)) { + PENDING(&t->signal->shared_pending, &t->blocked) || + cgroup_task_frozen(t)) { set_tsk_thread_flag(t, TIF_SIGPENDING); return true; } @@ -836,6 +841,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; + /* fall through */ default: return -EPERM; } @@ -1051,29 +1057,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } -#ifdef CONFIG_USER_NS -static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) -{ - if (current_user_ns() == task_cred_xxx(t, user_ns)) - return; - - if (SI_FROMKERNEL(info)) - return; - - rcu_read_lock(); - info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), - make_kuid(current_user_ns(), info->si_uid)); - rcu_read_unlock(); -} -#else -static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) -{ - return; -} -#endif - static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, - enum pid_type type, int from_ancestor_ns) + enum pid_type type, bool force) { struct sigpending *pending; struct sigqueue *q; @@ -1083,8 +1068,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, - from_ancestor_ns || (info == SEND_SIG_PRIV))) + if (!prepare_signal(sig, t, force)) goto ret; pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; @@ -1129,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc q->info.si_code = SI_USER; q->info.si_pid = task_tgid_nr_ns(current, task_active_pid_ns(t)); - q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + rcu_read_lock(); + q->info.si_uid = + from_kuid_munged(task_cred_xxx(t, user_ns), + current_uid()); + rcu_read_unlock(); break; case (unsigned long) SEND_SIG_PRIV: clear_siginfo(&q->info); @@ -1141,30 +1129,24 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc break; default: copy_siginfo(&q->info, info); - if (from_ancestor_ns) - q->info.si_pid = 0; break; } - - userns_fixup_signal_uid(&q->info, t); - - } else if (!is_si_special(info)) { - if (sig >= SIGRTMIN && info->si_code != SI_USER) { - /* - * Queue overflow, abort. We may abort if the - * signal was rt and sent by user using something - * other than kill(). - */ - result = TRACE_SIGNAL_OVERFLOW_FAIL; - ret = -EAGAIN; - goto ret; - } else { - /* - * This is a silent loss of information. We still - * send the signal, but the *info bits are lost. - */ - result = TRACE_SIGNAL_LOSE_INFO; - } + } else if (!is_si_special(info) && + sig >= SIGRTMIN && info->si_code != SI_USER) { + /* + * Queue overflow, abort. We may abort if the + * signal was rt and sent by user using something + * other than kill(). + */ + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; + } else { + /* + * This is a silent loss of information. We still + * send the signal, but the *info bits are lost. 
+ */ + result = TRACE_SIGNAL_LOSE_INFO; } out_set: @@ -1191,17 +1173,62 @@ ret: return ret; } +static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) +{ + bool ret = false; + switch (siginfo_layout(info->si_signo, info->si_code)) { + case SIL_KILL: + case SIL_CHLD: + case SIL_RT: + ret = true; + break; + case SIL_TIMER: + case SIL_POLL: + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + case SIL_SYS: + ret = false; + break; + } + return ret; +} + static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, enum pid_type type) { - int from_ancestor_ns = 0; + /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ + bool force = false; -#ifdef CONFIG_PID_NS - from_ancestor_ns = si_fromuser(info) && - !task_pid_nr_ns(current, task_active_pid_ns(t)); -#endif + if (info == SEND_SIG_NOINFO) { + /* Force if sent from an ancestor pid namespace */ + force = !task_pid_nr_ns(current, task_active_pid_ns(t)); + } else if (info == SEND_SIG_PRIV) { + /* Don't ignore kernel generated signals */ + force = true; + } else if (has_si_pid_and_uid(info)) { + /* SIGKILL and SIGSTOP is special or has ids */ + struct user_namespace *t_user_ns; + + rcu_read_lock(); + t_user_ns = task_cred_xxx(t, user_ns); + if (current_user_ns() != t_user_ns) { + kuid_t uid = make_kuid(current_user_ns(), info->si_uid); + info->si_uid = from_kuid_munged(t_user_ns, uid); + } + rcu_read_unlock(); + + /* A kernel generated signal? */ + force = (info->si_code == SI_KERNEL); - return __send_signal(sig, info, t, type, from_ancestor_ns); + /* From an ancestor pid namespace? */ + if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { + info->si_pid = 0; + force = true; + } + } + return __send_signal(sig, info, t, type, force); } static void print_fatal_signal(int signr) @@ -1268,12 +1295,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * We don't want to have recursive SIGSEGV's etc, for example, * that is why we also clear SIGNAL_UNKILLABLE. */ -int -force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) +static int +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) { unsigned long int flags; int ret, blocked, ignored; struct k_sigaction *action; + int sig = info->si_signo; spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; @@ -1298,6 +1326,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) return ret; } +int force_sig_info(struct kernel_siginfo *info) +{ + return force_sig_info_to_task(info, current); +} + /* * Nuke all other threads in the group. */ @@ -1434,13 +1467,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred, uid_eq(cred->uid, pcred->uid); } -/* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, - const struct cred *cred) +/* + * The usb asyncio usage of siginfo is wrong. The glibc support + * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. + * AKA after the generic fields: + * kernel_pid_t si_pid; + * kernel_uid32_t si_uid; + * sigval_t si_value; + * + * Unfortunately when usb generates SI_ASYNCIO it assumes the layout + * after the generic fields is: + * void __user *si_addr; + * + * This is a practical problem when there is a 64bit big endian kernel + * and a 32bit userspace. As the 32bit address will encoded in the low + * 32bits of the pointer. 
Those low 32bits will be stored at higher + * address than appear in a 32 bit pointer. So userspace will not + * see the address it was expecting for it's completions. + * + * There is nothing in the encoding that can allow + * copy_siginfo_to_user32 to detect this confusion of formats, so + * handle this by requiring the caller of kill_pid_usb_asyncio to + * notice when this situration takes place and to store the 32bit + * pointer in sival_int, instead of sival_addr of the sigval_t addr + * parameter. + */ +int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, + struct pid *pid, const struct cred *cred) { - int ret = -EINVAL; + struct kernel_siginfo info; struct task_struct *p; unsigned long flags; + int ret = -EINVAL; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = errno; + info.si_code = SI_ASYNCIO; + *((sigval_t *)&info.si_pid) = addr; if (!valid_signal(sig)) return ret; @@ -1451,17 +1515,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { + if (!kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } - ret = security_task_kill(p, info, sig, cred); + ret = security_task_kill(p, &info, sig, cred); if (ret) goto out_unlock; if (sig) { if (lock_task_sighand(p, &flags)) { - ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); + ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); unlock_task_sighand(p, &flags); } else ret = -ESRCH; @@ -1470,7 +1534,7 @@ out_unlock: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); +EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1546,9 +1610,17 @@ send_sig(int sig, struct task_struct *p, int priv) } EXPORT_SYMBOL(send_sig); -void force_sig(int sig, struct task_struct *p) +void force_sig(int sig) { - force_sig_info(sig, SEND_SIG_PRIV, p); + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + force_sig_info(&info); } EXPORT_SYMBOL(force_sig); @@ -1558,18 +1630,20 @@ EXPORT_SYMBOL(force_sig); * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. 
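The new kill_pid_usb_asyncio() helper above pushes the SI_ASYNCIO layout problem onto its caller: when a 32-bit process runs on a 64-bit big-endian kernel, the completion pointer has to be stored in sival_int rather than sival_ptr. A minimal sketch of a caller, assuming illustrative driver state (userurb, signr, status, pid, cred) and a compat flag the driver already tracks:

	sigval_t addr;

	if (compat_userspace)			/* assumed: 32-bit caller on a 64-bit kernel */
		addr.sival_int = (u32)(unsigned long)userurb;
	else
		addr.sival_ptr = userurb;

	kill_pid_usb_asyncio(signr, status, addr, pid, cred);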
*/ -void force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig) { + struct task_struct *p = current; + if (sig == SIGSEGV) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; spin_unlock_irqrestore(&p->sighand->siglock, flags); } - force_sig(SIGSEGV, p); + force_sig(SIGSEGV); } -int force_sig_fault(int sig, int code, void __user *addr +int force_sig_fault_to_task(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t) @@ -1589,7 +1663,16 @@ int force_sig_fault(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info(info.si_signo, &info, t); + return force_sig_info_to_task(&info, t); +} + +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) +{ + return force_sig_fault_to_task(sig, code, addr + ___ARCH_SI_TRAPNO(trapno) + ___ARCH_SI_IA64(imm, flags, isr), current); } int send_sig_fault(int sig, int code, void __user *addr @@ -1615,7 +1698,7 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } -int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +int force_sig_mceerr(int code, void __user *addr, short lsb) { struct kernel_siginfo info; @@ -1626,7 +1709,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct info.si_code = code; info.si_addr = addr; info.si_addr_lsb = lsb; - return force_sig_info(info.si_signo, &info, t); + return force_sig_info(&info); } int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) @@ -1655,7 +1738,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) info.si_addr = addr; info.si_lower = lower; info.si_upper = upper; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } #ifdef SEGV_PKUERR @@ -1669,7 +1752,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) info.si_code = SEGV_PKUERR; info.si_addr = addr; info.si_pkey = pkey; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } #endif @@ -1685,7 +1768,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr) info.si_errno = errno; info.si_code = TRAP_HWBKPT; info.si_addr = addr; - return force_sig_info(info.si_signo, &info, current); + return force_sig_info(&info); } int kill_pgrp(struct pid *pid, int sig, int priv) @@ -1798,6 +1881,14 @@ ret: return ret; } +static void do_notify_pidfd(struct task_struct *task) +{ + struct pid *pid; + + pid = task_pid(task); + wake_up_all(&pid->wait_pidfd); +} + /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. @@ -1821,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + /* Wake up all pidfd waiters */ + do_notify_pidfd(tsk); + if (sig != SIGCHLD) { /* * This is only possible if parent == real_parent. 
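With this series the force_sig*() family drops its task argument and always targets current; only the new *_to_task() variants keep an explicit target for the few callers that need one. A minimal sketch of how an architecture fault path would now raise a fault signal, assuming an arch without the ___ARCH_SI_TRAPNO/___ARCH_SI_IA64 extras and an illustrative fault_address variable:

	/* user access faulted and cannot be fixed up: signal the current task */
	if (user_mode(regs)) {
		force_sig_fault(SIGSEGV, SEGV_MAPERR,
				(void __user *)fault_address);
		return;
	}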
@@ -2106,7 +2200,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); + cgroup_enter_frozen(); freezable_schedule(); + cgroup_leave_frozen(true); } else { /* * By the time we got the lock, our tracer went away. @@ -2284,6 +2380,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ + cgroup_enter_frozen(); freezable_schedule(); return true; } else { @@ -2330,6 +2427,43 @@ static void do_jobctl_trap(void) } } +/** + * do_freezer_trap - handle the freezer jobctl trap + * + * Puts the task into frozen state, if only the task is not about to quit. + * In this case it drops JOBCTL_TRAP_FREEZE. + * + * CONTEXT: + * Must be called with @current->sighand->siglock held, + * which is always released before returning. + */ +static void do_freezer_trap(void) + __releases(¤t->sighand->siglock) +{ + /* + * If there are other trap bits pending except JOBCTL_TRAP_FREEZE, + * let's make another loop to give it a chance to be handled. + * In any case, we'll return back. + */ + if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) != + JOBCTL_TRAP_FREEZE) { + spin_unlock_irq(¤t->sighand->siglock); + return; + } + + /* + * Now we're sure that there is no pending fatal signal and no + * pending traps. Clear TIF_SIGPENDING to not get out of schedule() + * immediately (if there is a non-fatal signal pending), and + * put the task into sleep. + */ + __set_current_state(TASK_INTERRUPTIBLE); + clear_thread_flag(TIF_SIGPENDING); + spin_unlock_irq(¤t->sighand->siglock); + cgroup_enter_frozen(); + freezable_schedule(); +} + static int ptrace_signal(int signr, kernel_siginfo_t *info) { /* @@ -2436,9 +2570,14 @@ relock: } /* Has this task already been marked for death? */ - ksig->info.si_signo = signr = SIGKILL; - if (signal_group_exit(signal)) + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); + recalc_sigpending(); goto fatal; + } for (;;) { struct k_sigaction *ka; @@ -2447,9 +2586,24 @@ relock: do_signal_stop(0)) goto relock; - if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { - do_jobctl_trap(); + if (unlikely(current->jobctl & + (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) { + if (current->jobctl & JOBCTL_TRAP_MASK) { + do_jobctl_trap(); + spin_unlock_irq(&sighand->siglock); + } else if (current->jobctl & JOBCTL_TRAP_FREEZE) + do_freezer_trap(); + + goto relock; + } + + /* + * If the task is leaving the frozen state, let's update + * cgroup counters and reset the frozen bit. + */ + if (unlikely(cgroup_task_frozen(current))) { spin_unlock_irq(&sighand->siglock); + cgroup_leave_frozen(false); goto relock; } @@ -2545,6 +2699,8 @@ relock: fatal: spin_unlock_irq(&sighand->siglock); + if (unlikely(cgroup_task_frozen(current))) + cgroup_leave_frozen(true); /* * Anything else is fatal, maybe with a core dump. @@ -2608,7 +2764,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping) void signal_setup_done(int failed, struct ksignal *ksig, int stepping) { if (failed) - force_sigsegv(ksig->sig, current); + force_sigsegv(ksig->sig); else signal_delivered(ksig, stepping); } @@ -2795,79 +2951,49 @@ EXPORT_SYMBOL(sigprocmask); * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. 
+ * + * Note that it does set_restore_sigmask() in advance, so it must be always + * paired with restore_saved_sigmask_unless() before return from syscall. */ -int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, - sigset_t *oldset, size_t sigsetsize) +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(set, usigmask, sizeof(sigset_t))) + if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_user_sigmask); #ifdef CONFIG_COMPAT -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, - sigset_t *set, sigset_t *oldset, +int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (get_compat_sigset(set, usigmask)) + if (get_compat_sigset(&kmask, umask)) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_compat_user_sigmask); #endif -/* - * restore_user_sigmask: - * usigmask: sigmask passed in from userland. - * sigsaved: saved sigmask when the syscall started and changed the sigmask to - * usigmask. - * - * This is useful for syscalls such as ppoll, pselect, io_pgetevents and - * epoll_pwait where a new sigmask is passed in from userland for the syscalls. - */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) -{ - - if (!usigmask) - return; - /* - * When signals are pending, do not restore them here. - * Restoring sigmask here can lead to delivering signals that the above - * syscalls are intended to block because of the sigmask passed in. - */ - if (signal_pending(current)) { - current->saved_sigmask = *sigsaved; - set_restore_sigmask(); - return; - } - - /* - * This is needed because the fast syscall return path does not restore - * saved_sigmask when signals are not pending. 
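The reworked set_user_sigmask() arms TIF_RESTORE_SIGMASK itself, so every caller now follows one pattern: install the temporary mask, do the wait, then call restore_saved_sigmask_unless() so the saved mask is put back only when no signal has to be delivered under the temporary one. A hedged sketch of that flow; do_the_wait() is a placeholder, and the real syscalls test their own restart/interrupt error codes rather than a literal -EINTR:

	int ret;

	ret = set_user_sigmask(umask, sigsetsize);	/* also sets TIF_RESTORE_SIGMASK */
	if (ret)
		return ret;

	ret = do_the_wait(timeout);			/* ppoll/pselect/epoll_pwait body */

	/* keep the temporary mask in place if a signal still has to be delivered */
	restore_saved_sigmask_unless(ret == -EINTR);
	return ret;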
- */ - set_current_blocked(sigsaved); -} -EXPORT_SYMBOL(restore_user_sigmask); - /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals @@ -3452,7 +3578,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, +COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, struct compat_siginfo __user *, uinfo, struct old_timespec32 __user *, uts, compat_size_t, sigsetsize) { @@ -3484,6 +3610,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, #endif #endif +static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info) +{ + clear_siginfo(info); + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = task_tgid_vnr(current); + info->si_uid = from_kuid_munged(current_user_ns(), current_uid()); +} + /** * sys_kill - send a signal to a process * @pid: the PID of the process @@ -3493,16 +3629,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct kernel_siginfo info; - clear_siginfo(&info); - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = task_tgid_vnr(current); - info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); + prepare_kill_siginfo(sig, &info); return kill_something_info(sig, &info, pid); } +/* + * Verify that the signaler and signalee either are in the same pid namespace + * or that the signaler's pid namespace is an ancestor of the signalee's pid + * namespace. + */ +static bool access_pidfd_pidns(struct pid *pid) +{ + struct pid_namespace *active = task_active_pid_ns(current); + struct pid_namespace *p = ns_of_pid(pid); + + for (;;) { + if (!p) + return false; + if (p == active) + break; + p = p->parent; + } + + return true; +} + +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +{ +#ifdef CONFIG_COMPAT + /* + * Avoid hooking up compat syscalls and instead handle necessary + * conversions here. Note, this is a stop-gap measure and should not be + * considered a generic solution. + */ + if (in_compat_syscall()) + return copy_siginfo_from_user32( + kinfo, (struct compat_siginfo __user *)info); +#endif + return copy_siginfo_from_user(kinfo, info); +} + +static struct pid *pidfd_to_pid(const struct file *file) +{ + if (file->f_op == &pidfd_fops) + return file->private_data; + + return tgid_pidfd_to_pid(file); +} + +/** + * sys_pidfd_send_signal - Signal a process through a pidfd + * @pidfd: file descriptor of the process + * @sig: signal to send + * @info: signal info + * @flags: future flags + * + * The syscall currently only signals via PIDTYPE_PID which covers + * kill(<positive-pid>, <signal>. It does not signal threads or process + * groups. + * In order to extend the syscall to threads and process groups the @flags + * argument should be used. In essence, the @flags argument will determine + * what is signaled and not the file descriptor itself. Put in other words, + * grouping is a property of the flags argument not a property of the file + * descriptor. + * + * Return: 0 on success, negative errno on failure + */ +SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, + siginfo_t __user *, info, unsigned int, flags) +{ + int ret; + struct fd f; + struct pid *pid; + kernel_siginfo_t kinfo; + + /* Enforce flags be set to 0 until we add an extension. 
*/ + if (flags) + return -EINVAL; + + f = fdget(pidfd); + if (!f.file) + return -EBADF; + + /* Is this a pidfd? */ + pid = pidfd_to_pid(f.file); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto err; + } + + ret = -EINVAL; + if (!access_pidfd_pidns(pid)) + goto err; + + if (info) { + ret = copy_siginfo_from_user_any(&kinfo, info); + if (unlikely(ret)) + goto err; + + ret = -EINVAL; + if (unlikely(sig != kinfo.si_signo)) + goto err; + + /* Only allow sending arbitrary signals to yourself. */ + ret = -EPERM; + if ((task_pid(current) != pid) && + (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) + goto err; + } else { + prepare_kill_siginfo(sig, &kinfo); + } + + ret = kill_pid_info(sig, &kinfo, pid); + +err: + fdput(f); + return ret; +} + static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) { @@ -4289,6 +4534,28 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_syscall); CHECK_OFFSET(si_arch); #undef CHECK_OFFSET + + /* usb asyncio */ + BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != + offsetof(struct siginfo, si_addr)); + if (sizeof(int) == sizeof(void __user *)) { + BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != + sizeof(void __user *)); + } else { + BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + + sizeof_field(struct siginfo, si_uid)) != + sizeof(void __user *)); + BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != + offsetof(struct siginfo, si_uid)); + } +#ifdef CONFIG_COMPAT + BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != + offsetof(struct compat_siginfo, si_addr)); + BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != + sizeof(compat_uptr_t)); + BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != + sizeof_field(struct siginfo, si_pid)); +#endif } void __init signals_init(void) diff --git a/kernel/smp.c b/kernel/smp.c index f4cf1b0bb3b8..616d4d114847 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * @@ -33,7 +34,7 @@ struct call_function_data { cpumask_var_t cpumask_ipi; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); +static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); @@ -486,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ -int smp_call_function(smp_call_func_t func, void *info, int wait) +void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); - - return 0; } EXPORT_SYMBOL(smp_call_function); @@ -593,18 +592,16 @@ void __init smp_init(void) * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead * of local_irq_disable/enable(). 
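Until libc grows a wrapper, userspace reaches the new syscall through syscall(2). A minimal sketch, assuming kernel headers that define __NR_pidfd_send_signal and a target pid of 1234 picked purely for illustration; the pidfd here is the process's /proc directory opened read-only:

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int pidfd = open("/proc/1234", O_DIRECTORY | O_RDONLY);

	if (pidfd < 0)
		return 1;
	/* info == NULL makes the kernel build an SI_USER siginfo; flags must be 0 */
	if (syscall(__NR_pidfd_send_signal, pidfd, SIGTERM, NULL, 0) < 0)
		perror("pidfd_send_signal");
	close(pidfd);
	return 0;
}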
*/ -int on_each_cpu(void (*func) (void *info), void *info, int wait) +void on_each_cpu(void (*func) (void *info), void *info, int wait) { unsigned long flags; - int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, wait); + smp_call_function(func, info, wait); local_irq_save(flags); func(info); local_irq_restore(flags); preempt_enable(); - return ret; } EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c230c2dd48e1..2efe1e206167 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Common SMP CPU bringup/teardown functions */ diff --git a/kernel/softirq.c b/kernel/softirq.c index d28813306b2c..0427a86743a4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/softirq.c * * Copyright (C) 1992 Linus Torvalds * - * Distribute under GPLv2. - * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ @@ -89,7 +88,8 @@ static bool ksoftirqd_running(unsigned long pending) if (pending & SOFTIRQ_NOW_MASK) return false; - return tsk && (tsk->state == TASK_RUNNING); + return tsk && (tsk->state == TASK_RUNNING) && + !__kthread_should_park(tsk); } /* @@ -572,57 +572,6 @@ void tasklet_kill(struct tasklet_struct *t) } EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. - */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ - struct tasklet_hrtimer *ttimer = - container_of(timer, struct tasklet_hrtimer, timer); - - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ - struct tasklet_hrtimer *ttimer = (void *)data; - enum hrtimer_restart restart; - - restart = ttimer->function(&ttimer->timer); - if (restart != HRTIMER_NORESTART) - hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t which_clock, enum hrtimer_mode mode) -{ - hrtimer_init(&ttimer->timer, which_clock, mode); - ttimer->timer.function = __hrtimer_tasklet_trampoline; - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, - (unsigned long)ttimer); - ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); - void __init softirq_init(void) { int cpu; @@ -700,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu) /* Find end, append list for that CPU. 
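Since smp_call_function() cannot fail once the IPI path is up, it and on_each_cpu() now return void and callers simply drop the dead error handling. A hedged sketch of a typical caller, with the per-CPU counter purely illustrative:

static DEFINE_PER_CPU(unsigned long, demo_count);

static void reset_demo_count(void *unused)
{
	this_cpu_write(demo_count, 0);
}

static void reset_all_demo_counts(void)
{
	/* run reset_demo_count() on every online CPU, including this one, and wait */
	on_each_cpu(reset_demo_count, NULL, 1);
}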
*/ if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; - this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); + __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); per_cpu(tasklet_vec, cpu).head = NULL; per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; } diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..e6a02b274b73 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/stacktrace.c * @@ -5,41 +6,56 @@ * * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ +#include <linux/sched/task_stack.h> +#include <linux/sched/debug.h> #include <linux/sched.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> -void print_stack_trace(struct stack_trace *trace, int spaces) +/** + * stack_trace_print - Print the entries in the stack trace + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + */ +void stack_trace_print(unsigned long *entries, unsigned int nr_entries, + int spaces) { - int i; + unsigned int i; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return; - for (i = 0; i < trace->nr_entries; i++) - printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); + for (i = 0; i < nr_entries; i++) + printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]); } -EXPORT_SYMBOL_GPL(print_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_print); -int snprint_stack_trace(char *buf, size_t size, - struct stack_trace *trace, int spaces) +/** + * stack_trace_snprint - Print the entries in the stack trace into a buffer + * @buf: Pointer to the print buffer + * @size: Size of the print buffer + * @entries: Pointer to storage array + * @nr_entries: Number of entries in the storage array + * @spaces: Number of leading spaces to print + * + * Return: Number of bytes printed. 
+ */ +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, + unsigned int nr_entries, int spaces) { - int i; - int generated; - int total = 0; + unsigned int generated, i, total = 0; - if (WARN_ON(!trace->entries)) + if (WARN_ON(!entries)) return 0; - for (i = 0; i < trace->nr_entries; i++) { + for (i = 0; i < nr_entries && size; i++) { generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', - (void *)trace->entries[i]); + (void *)entries[i]); total += generated; - - /* Assume that generated isn't a negative number */ if (generated >= size) { buf += size; size = 0; @@ -51,7 +67,176 @@ int snprint_stack_trace(char *buf, size_t size, return total; } -EXPORT_SYMBOL_GPL(snprint_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_snprint); + +#ifdef CONFIG_ARCH_STACKWALK + +struct stacktrace_cookie { + unsigned long *store; + unsigned int size; + unsigned int skip; + unsigned int len; +}; + +static bool stack_trace_consume_entry(void *cookie, unsigned long addr, + bool reliable) +{ + struct stacktrace_cookie *c = cookie; + + if (c->len >= c->size) + return false; + + if (c->skip > 0) { + c->skip--; + return true; + } + c->store[c->len++] = addr; + return c->len < c->size; +} + +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, + bool reliable) +{ + if (in_sched_functions(addr)) + return true; + return stack_trace_consume_entry(cookie, addr, reliable); +} + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + arch_stack_walk(consume_entry, &c, current, NULL); + return c.len; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr + 1, + }; + + if (!try_get_task_stack(tsk)) + return 0; + + arch_stack_walk(consume_entry, &c, tsk, NULL); + put_task_stack(tsk); + return c.len; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
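The new stacktrace API is a plain array-in, count-out interface, which is what the kernel-doc above describes. A minimal sketch of a debugging helper built on it; the 16-entry depth is arbitrary:

static void dump_current_stack(void)
{
	unsigned long entries[16];
	unsigned int nr;

	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	stack_trace_print(entries, nr, 0);
}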
+ */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + .skip = skipnr, + }; + + arch_stack_walk(consume_entry, &c, current, regs); + return c.len; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + int ret; + + /* + * If the task doesn't have a stack (e.g., a zombie), the stack is + * "reliably" empty. + */ + if (!try_get_task_stack(tsk)) + return 0; + + ret = arch_stack_walk_reliable(consume_entry, &c, tsk); + put_task_stack(tsk); + return ret ? ret : c.len; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + stack_trace_consume_fn consume_entry = stack_trace_consume_entry; + struct stacktrace_cookie c = { + .store = store, + .size = size, + }; + + /* Trace user stack if not a kernel thread */ + if (current->flags & PF_KTHREAD) + return 0; + + arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + return c.len; +} +#endif + +#else /* CONFIG_ARCH_STACKWALK */ /* * Architectures that do not implement save_stack_trace_*() @@ -70,10 +255,117 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); } -__weak int -save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) +/** + * stack_trace_save - Save a stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, + unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace(&trace); + return trace.nr_entries; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task: The task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_tsk(struct task_struct *task, + unsigned long *store, unsigned int size, + unsigned int skipnr) { - WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); - return -ENOSYS; + struct stack_trace trace = { + 
.entries = store, + .max_entries = size, + .skip = skipnr + 1, + }; + + save_stack_trace_tsk(task, &trace); + return trace.nr_entries; } + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs: Pointer to pt_regs to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * @skipnr: Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, + unsigned int size, unsigned int skipnr) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + .skip = skipnr, + }; + + save_stack_trace_regs(regs, &trace); + return trace.nr_entries; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk: Pointer to the task to examine + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: An error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is + * reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, + unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + int ret = save_stack_trace_tsk_reliable(tsk, &trace); + + return ret ? ret : trace.nr_entries; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store: Pointer to storage array + * @size: Size of the storage array + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ + struct stack_trace trace = { + .entries = store, + .max_entries = size, + }; + + save_stack_trace_user(&trace); + return trace.nr_entries; +} +#endif /* CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* !CONFIG_ARCH_STACKWALK */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..b4f83f7bdf86 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kernel/stop_machine.c * @@ -5,8 +6,6 @@ * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au * Copyright (C) 2010 SUSE Linux Products GmbH * Copyright (C) 2010 Tejun Heo <tj@kernel.org> - * - * This file is released under the GPLv2 and any later version. */ #include <linux/completion.h> #include <linux/cpu.h> @@ -178,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } +void __weak stop_machine_yield(const struct cpumask *cpumask) +{ + cpu_relax(); +} + /* This is the cpu_stop function which stops the CPU. 
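stop_machine_yield() is __weak precisely so an architecture can replace the plain busy-wait with something friendlier to its platform, for example hinting a hypervisor about the CPUs still being waited on. A hedged sketch of such an override; arch_yield_to_cpus() is hypothetical and stands in for whatever facility the architecture provides:

/* arch override, e.g. somewhere under arch/<arch>/kernel/ */
void stop_machine_yield(const struct cpumask *cpumask)
{
	/* hypothetical helper: tell the hypervisor which CPUs we are spinning on */
	arch_yield_to_cpus(cpumask);
	cpu_relax();
}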
*/ static int multi_cpu_stop(void *data) { struct multi_stop_data *msdata = data; enum multi_stop_state curstate = MULTI_STOP_NONE; int cpu = smp_processor_id(), err = 0; + const struct cpumask *cpumask; unsigned long flags; bool is_active; @@ -193,15 +198,18 @@ static int multi_cpu_stop(void *data) */ local_save_flags(flags); - if (!msdata->active_cpus) - is_active = cpu == cpumask_first(cpu_online_mask); - else - is_active = cpumask_test_cpu(cpu, msdata->active_cpus); + if (!msdata->active_cpus) { + cpumask = cpu_online_mask; + is_active = cpu == cpumask_first(cpumask); + } else { + cpumask = msdata->active_cpus; + is_active = cpumask_test_cpu(cpu, cpumask); + } /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax_yield(); + stop_machine_yield(cpumask); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { @@ -513,7 +521,7 @@ repeat: } preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; } } diff --git a/kernel/sys.c b/kernel/sys.c index f7eb62eceb24..2969304c29fe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -516,7 +516,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -525,7 +525,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !ns_capable(old->user_ns, CAP_SETUID)) + !ns_capable_setid(old->user_ns, CAP_SETUID)) goto error; } @@ -584,7 +584,7 @@ long __sys_setuid(uid_t uid) old = current_cred(); retval = -EPERM; - if (ns_capable(old->user_ns, CAP_SETUID)) { + if (ns_capable_setid(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -646,7 +646,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) old = current_cred(); retval = -EPERM; - if (!ns_capable(old->user_ns, CAP_SETUID)) { + if (!ns_capable_setid(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -814,7 +814,7 @@ long __sys_setfsuid(uid_t uid) if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - ns_capable(old->user_ns, CAP_SETUID)) { + ns_capable_setid(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -1747,6 +1747,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; + /* fall through */ case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); @@ -1881,13 +1882,14 @@ exit_err: } /* + * Check arithmetic relations of passed addresses. + * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace. 
*/ -static int validate_prctl_map(struct prctl_mm_map *prctl_map) +static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) { unsigned long mmap_max_addr = TASK_SIZE; - struct mm_struct *mm = current->mm; int error = -EINVAL, i; static const unsigned char offsets[] = { @@ -1923,7 +1925,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); @@ -1948,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) prctl_map->start_data)) goto out; - /* - * Someone is trying to cheat the auxv vector. - */ - if (prctl_map->auxv_size) { - if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) - goto out; - } - - /* - * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local sys admin should - * be allowed to. - */ - if (prctl_map->exe_fd != (u32)-1) { - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) - goto out; - } - error = 0; out: return error; @@ -1992,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) return error; if (prctl_map.auxv_size) { + /* + * Someone is trying to cheat the auxv vector. + */ + if (!prctl_map.auxv || + prctl_map.auxv_size > sizeof(mm->saved_auxv)) + return -EINVAL; + memset(user_auxv, 0, sizeof(user_auxv)); if (copy_from_user(user_auxv, (const void __user *)prctl_map.auxv, @@ -2009,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data } if (prctl_map.exe_fd != (u32)-1) { + /* + * Make sure the caller has the rights to + * change /proc/pid/exe link: only local sys admin should + * be allowed to. + */ + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + return -EINVAL; + error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error; @@ -2096,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { struct mm_struct *mm = current->mm; - struct prctl_mm_map prctl_map; + struct prctl_mm_map prctl_map = { + .auxv = NULL, + .auxv_size = 0, + .exe_fd = -1, + }; struct vm_area_struct *vma; int error; @@ -2124,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates of arg boundaries, we need + * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr + * validation. 
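The whole layout is still handed in as one struct prctl_mm_map from userspace; the auxv and exe_fd checks above simply moved next to the code that consumes them. A hedged sketch of a checkpoint/restore style caller, with the saved layout and the choice to leave auxv and the exe link untouched purely illustrative:

#include <string.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

static int restore_mm_layout(const struct prctl_mm_map *saved)
{
	struct prctl_mm_map map;

	memcpy(&map, saved, sizeof(map));
	map.auxv = NULL;		/* keep the current auxv */
	map.auxv_size = 0;
	map.exe_fd = (__u32)-1;		/* do not touch /proc/self/exe */

	return prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)&map,
		     sizeof(map), 0);
}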
+ */ + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); + spin_lock(&mm->arg_lock); prctl_map.start_code = mm->start_code; prctl_map.end_code = mm->end_code; prctl_map.start_data = mm->start_data; @@ -2138,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr, prctl_map.arg_end = mm->arg_end; prctl_map.env_start = mm->env_start; prctl_map.env_end = mm->env_end; - prctl_map.auxv = NULL; - prctl_map.auxv_size = 0; - prctl_map.exe_fd = -1; switch (opt) { case PR_SET_MM_START_CODE: @@ -2180,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr, goto out; } - error = validate_prctl_map(&prctl_map); + error = validate_prctl_map_addr(&prctl_map); if (error) goto out; @@ -2217,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_write(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); + up_read(&mm->mmap_sem); return error; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..34b76895b81e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -42,10 +42,15 @@ COND_SYSCALL(io_destroy); COND_SYSCALL(io_submit); COND_SYSCALL_COMPAT(io_submit); COND_SYSCALL(io_cancel); +COND_SYSCALL(io_getevents_time32); COND_SYSCALL(io_getevents); +COND_SYSCALL(io_pgetevents_time32); COND_SYSCALL(io_pgetevents); -COND_SYSCALL_COMPAT(io_getevents); +COND_SYSCALL_COMPAT(io_pgetevents_time32); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); +COND_SYSCALL(io_uring_register); /* fs/xattr.c */ @@ -114,9 +119,9 @@ COND_SYSCALL_COMPAT(signalfd4); /* fs/timerfd.c */ COND_SYSCALL(timerfd_create); COND_SYSCALL(timerfd_settime); -COND_SYSCALL_COMPAT(timerfd_settime); +COND_SYSCALL(timerfd_settime32); COND_SYSCALL(timerfd_gettime); -COND_SYSCALL_COMPAT(timerfd_gettime); +COND_SYSCALL(timerfd_gettime32); /* fs/utimes.c */ @@ -132,10 +137,12 @@ COND_SYSCALL(capset); /* kernel/exit.c */ /* kernel/fork.c */ +/* __ARCH_WANT_SYS_CLONE3 */ +COND_SYSCALL(clone3); /* kernel/futex.c */ COND_SYSCALL(futex); -COND_SYSCALL_COMPAT(futex); +COND_SYSCALL(futex_time32); COND_SYSCALL(set_robust_list); COND_SYSCALL_COMPAT(set_robust_list); COND_SYSCALL(get_robust_list); @@ -162,8 +169,6 @@ COND_SYSCALL(syslog); /* kernel/sched/core.c */ -/* kernel/signal.c */ - /* kernel/sys.c */ COND_SYSCALL(setregid); COND_SYSCALL(setgid); @@ -187,9 +192,9 @@ COND_SYSCALL(mq_open); COND_SYSCALL_COMPAT(mq_open); COND_SYSCALL(mq_unlink); COND_SYSCALL(mq_timedsend); -COND_SYSCALL_COMPAT(mq_timedsend); +COND_SYSCALL(mq_timedsend_time32); COND_SYSCALL(mq_timedreceive); -COND_SYSCALL_COMPAT(mq_timedreceive); +COND_SYSCALL(mq_timedreceive_time32); COND_SYSCALL(mq_notify); COND_SYSCALL_COMPAT(mq_notify); COND_SYSCALL(mq_getsetattr); @@ -197,8 +202,10 @@ COND_SYSCALL_COMPAT(mq_getsetattr); /* ipc/msg.c */ COND_SYSCALL(msgget); +COND_SYSCALL(old_msgctl); COND_SYSCALL(msgctl); COND_SYSCALL_COMPAT(msgctl); +COND_SYSCALL_COMPAT(old_msgctl); COND_SYSCALL(msgrcv); COND_SYSCALL_COMPAT(msgrcv); COND_SYSCALL(msgsnd); @@ -206,16 +213,20 @@ COND_SYSCALL_COMPAT(msgsnd); /* ipc/sem.c */ COND_SYSCALL(semget); +COND_SYSCALL(old_semctl); COND_SYSCALL(semctl); COND_SYSCALL_COMPAT(semctl); +COND_SYSCALL_COMPAT(old_semctl); COND_SYSCALL(semtimedop); -COND_SYSCALL_COMPAT(semtimedop); +COND_SYSCALL(semtimedop_time32); COND_SYSCALL(semop); /* ipc/shm.c */ COND_SYSCALL(shmget); +COND_SYSCALL(old_shmctl); COND_SYSCALL(shmctl); COND_SYSCALL_COMPAT(shmctl); +COND_SYSCALL_COMPAT(old_shmctl); COND_SYSCALL(shmat); COND_SYSCALL_COMPAT(shmat); COND_SYSCALL(shmdt); @@ -285,7 +296,7 @@ 
COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); COND_SYSCALL(recvmmsg); COND_SYSCALL(recvmmsg_time32); -COND_SYSCALL_COMPAT(recvmmsg); +COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); /* @@ -366,6 +377,7 @@ COND_SYSCALL(kexec_file_load); /* s390 */ COND_SYSCALL(s390_pci_mmio_read); COND_SYSCALL(s390_pci_mmio_write); +COND_SYSCALL(s390_ipc); COND_SYSCALL_COMPAT(s390_ipc); /* powerpc */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba4d9e85feb8..43186ccfa139 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface * @@ -66,6 +67,9 @@ #include <linux/kexec.h> #include <linux/bpf.h> #include <linux/mount.h> +#include <linux/userfaultfd_k.h> + +#include "../lib/kstrtox.h" #include <linux/uaccess.h> #include <asm/processor.h> @@ -126,7 +130,9 @@ static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static unsigned long zero_ul; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -182,17 +188,17 @@ extern int no_unaligned_warning; * enum sysctl_writes_mode - supported sysctl write modes * * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value - * to be written, and multiple writes on the same sysctl file descriptor - * will rewrite the sysctl value, regardless of file position. No warning - * is issued when the initial position is not 0. + * to be written, and multiple writes on the same sysctl file descriptor + * will rewrite the sysctl value, regardless of file position. No warning + * is issued when the initial position is not 0. * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is - * not 0. + * not 0. * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at - * file position 0 and the value must be fully contained in the buffer - * sent to the write syscall. If dealing with strings respect the file - * position, but restrict this to the max length of the buffer, anything - * passed the max lenght will be ignored. Multiple writes will append - * to the buffer. + * file position 0 and the value must be fully contained in the buffer + * sent to the write syscall. If dealing with strings respect the file + * position, but restrict this to the max length of the buffer, anything + * passed the max length will be ignored. Multiple writes will append + * to the buffer. * * These write modes control how current file position affects the behavior of * updating sysctl values through the proc interface on each write. 
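The sys_ni.c additions above (io_uring_*, clone3, the *_time32 variants and the old_* IPC entries) all lean on the same mechanism: COND_SYSCALL() emits a weak stub returning -ENOSYS, which a real syscall definition overrides at link time when the corresponding option is built in. Roughly, and simplified, since the real macro in <linux/syscalls.h> depends on the architecture's syscall-wrapper configuration:

#define COND_SYSCALL(name)					\
	asmlinkage __weak long sys_##name(void)			\
	{							\
		return sys_ni_syscall();			\
	}

/*
 * So COND_SYSCALL(io_uring_setup); provides an -ENOSYS fallback that the
 * strong sys_io_uring_setup() definition replaces when io_uring is enabled.
 */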
@@ -446,6 +452,22 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_UCLAMP_TASK + { + .procname = "sched_util_clamp_min", + .data = &sysctl_sched_uclamp_util_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, + { + .procname = "sched_util_clamp_max", + .data = &sysctl_sched_uclamp_util_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, +#endif #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", @@ -467,6 +489,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", @@ -1229,6 +1262,13 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, + { + .procname = "bpf_stats_enabled", + .data = &bpf_stats_enabled_key.key, + .maxlen = sizeof(bpf_stats_enabled_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, #endif #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) { @@ -1446,7 +1486,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_extfrag_threshold, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = sysctl_extfrag_handler, + .proc_handler = proc_dointvec_minmax, .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, @@ -1691,6 +1731,17 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; @@ -1722,6 +1773,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2092,6 +2145,41 @@ static void proc_skip_char(char **buf, size_t *size, const char v) } } +/** + * strtoul_lenient - parse an ASCII formatted integer from a buffer and only + * fail on overflow + * + * @cp: kernel buffer containing the string to parse + * @endp: pointer to store the trailing characters + * @base: the base to use + * @res: where the parsed integer will be stored + * + * In case of success 0 is returned and @res will contain the parsed integer, + * @endp will hold any trailing characters. + * This function will fail the parse on overflow. If there wasn't an overflow + * the function will defer the decision what characters count as invalid to the + * caller. 
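The new tunables added above (sched_energy_aware, unprivileged_userfaultfd, and so on) follow the same boolean-knob pattern: an int clamped to [0,1] through extra1/extra2. A minimal sketch of such an entry, with the variable and procname invented for illustration and zero/one being the statics this file already defines:

static int sysctl_demo_feature_enabled;

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_feature_enabled",
		.data		= &sysctl_demo_feature_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
	{ }
};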
+ */ +static int strtoul_lenient(const char *cp, char **endp, unsigned int base, + unsigned long *res) +{ + unsigned long long result; + unsigned int rv; + + cp = _parse_integer_fixup_radix(cp, &base); + rv = _parse_integer(cp, base, &result); + if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) + return -ERANGE; + + cp += rv; + + if (endp) + *endp = (char *)cp; + + *res = (unsigned long)result; + return 0; +} + #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer @@ -2135,7 +2223,8 @@ static int proc_get_long(char **buf, size_t *size, if (!isdigit(*p)) return -EINVAL; - *val = simple_strtoul(p, &p, 0); + if (strtoul_lenient(p, &p, 0, val)) + return -EINVAL; len = p - tmp; @@ -2577,23 +2666,25 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { + int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; + /* + * If writing, first do so via a temporary local int so we can + * bounds-check it before touching *valp. + */ + int *ip = write ? &tmp : valp; + + ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); + if (ret) + return ret; + if (write) { - int val = *negp ? -*lvalp : *lvalp; - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -EINVAL; - *valp = val; - } else { - int val = *valp; - if (val < 0) { - *negp = true; - *lvalp = -(unsigned long)val; - } else { - *negp = false; - *lvalp = (unsigned long)val; - } + *valp = tmp; } + return 0; } @@ -2642,22 +2733,22 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { + int ret; + unsigned int tmp; struct do_proc_douintvec_minmax_conv_param *param = data; + /* write via temporary local uint for bounds-checking */ + unsigned int *up = write ? 
&tmp : valp; - if (write) { - unsigned int val = *lvalp; + ret = do_proc_douintvec_conv(lvalp, up, write, data); + if (ret) + return ret; - if (*lvalp > UINT_MAX) - return -EINVAL; - - if ((param->min && *param->min > val) || - (param->max && *param->max < val)) + if (write) { + if ((param->min && *param->min > tmp) || + (param->max && *param->max < tmp)) return -ERANGE; - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; + *valp = tmp; } return 0; @@ -2805,8 +2896,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (neg) continue; val = convmul * val / convdiv; - if ((min && val < *min) || (max && val > *max)) - continue; + if ((min && val < *min) || (max && val > *max)) { + err = -EINVAL; + break; + } *i = val; } else { val = convdiv * (*i) / convmul; @@ -3089,17 +3182,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (write) { char *kbuf, *p; + size_t skipped = 0; - if (left > PAGE_SIZE - 1) + if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; + /* How much of the buffer we'll skip this pass */ + skipped = *lenp - left; + } p = kbuf = memdup_user_nul(buffer, left); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), - sizeof(unsigned long), - GFP_KERNEL); + tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) { kfree(kbuf); return -ENOMEM; @@ -3108,9 +3203,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, while (!err && left) { unsigned long val_a, val_b; bool neg; + size_t saved_left; + /* In case we stop parsing mid-number, we can reset */ + saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); + /* + * If we consumed the entirety of a truncated buffer or + * only one char is left (may be a "-"), then stop here, + * reset, & come back for more. + */ + if ((left <= 1) && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_a >= bitmap_len || neg) { @@ -3128,6 +3236,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); + /* + * If we consumed all of a truncated buffer or + * then stop here, reset, & come back for more. 
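proc_do_large_bitmap() backs range-list sysctls such as ip_local_reserved_ports, where userspace writes strings like "8080,30000-32767"; the table's data member points at the bitmap pointer (not at the bitmap itself) and maxlen is the size in bits. A hedged sketch of such an entry, with the bitmap name and size chosen for illustration:

static unsigned long *demo_reserved_ports;	/* 65536-bit bitmap, allocated elsewhere */

static struct ctl_table demo_ports_table[] = {
	{
		.procname	= "demo_reserved_ports",
		.data		= &demo_reserved_ports,	/* pointer to the bitmap pointer */
		.maxlen		= 65536,		/* length of the bitmap in bits */
		.mode		= 0644,
		.proc_handler	= proc_do_large_bitmap,
	},
	{ }
};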
+ */ + if (!left && skipped) { + left = saved_left; + break; + } + if (err) break; if (val_b >= bitmap_len || neg || @@ -3146,6 +3263,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, proc_skip_char(&p, &left, '\n'); } kfree(kbuf); + left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -3190,7 +3308,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, *ppos += *lenp; } - kfree(tmp_bitmap); + bitmap_free(tmp_bitmap); return err; } @@ -3257,9 +3375,46 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, return -ENOSYS; } +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} #endif /* CONFIG_PROC_SYSCTL */ +#if defined(CONFIG_SYSCTL) +int proc_do_static_key(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + struct static_key *key = (struct static_key *)table->data; + static DEFINE_MUTEX(static_key_mutex); + int val, ret; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + .extra1 = &zero, + .extra2 = &one, + }; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&static_key_mutex); + val = static_key_enabled(key); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (val) + static_key_enable(key); + else + static_key_disable(key); + } + mutex_unlock(&static_key_mutex); + return ret; +} +#endif /* * No sense putting this after each symbol definition, twice, * exception granted :-) @@ -3274,3 +3429,4 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(proc_do_large_bitmap); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4e62a4a8fa91..13a0f2e6ebc2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -1,19 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * taskstats.c - Export per-task statistics to userland * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include <linux/kernel.h> @@ -375,7 +365,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) ? 
TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; - na = nla_nest_start(skb, aggr); + na = nla_nest_start_noflag(skb, aggr); if (!na) goto err; @@ -649,17 +639,41 @@ err: static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, - .policy = taskstats_cmd_get_policy, - .flags = GENL_ADMIN_PERM, + /* policy enforced later */ + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_HASPOL, }, { .cmd = CGROUPSTATS_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, - .policy = cgroupstats_cmd_get_policy, + /* policy enforced later */ + .flags = GENL_CMD_CAP_HASPOL, }, }; +static int taskstats_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + const struct nla_policy *policy = NULL; + + switch (ops->cmd) { + case TASKSTATS_CMD_GET: + policy = taskstats_cmd_get_policy; + break; + case CGROUPSTATS_CMD_GET: + policy = cgroupstats_cmd_get_policy; + break; + default: + return -EINVAL; + } + + return nlmsg_validate_deprecated(info->nlhdr, GENL_HDRLEN, + TASKSTATS_CMD_ATTR_MAX, policy, + info->extack); +} + static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, @@ -667,6 +681,7 @@ static struct genl_family family __ro_after_init = { .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), + .pre_doit = taskstats_pre_doit, }; /* Needed early in initialization */ diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 7bca480151b0..76c997fdbc9d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -1,17 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * test_kprobes.c - simple sanity test for *probes * * Copyright IBM Corp. 2008 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. */ #define pr_fmt(fmt) "Kprobe smoke test: " fmt diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 58b981f4bb5d..fcc42353f125 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Timer subsystem related configuration options # @@ -117,6 +118,35 @@ config NO_HZ_FULL endchoice +config CONTEXT_TRACKING + bool + +config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. 
This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the development of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f1e46f338a9c..1867044800bb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2c97e8c2d29f..57518efc3810 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); /** * alarmtimer_suspend - Suspend time callback * @dev: unused - * @state: unused * * When we are going into suspend, we look through the bases * to see which is the soonest timer to expire. We then @@ -594,7 +593,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return ktime_sub(now, alarm->node.expires); + return ktime_sub(alarm->node.expires, now); } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void) } #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu: The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. 
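The alarmtimer hunk above fixes the operand order of ktime_sub() in alarm_timer_remaining(). A one-line sketch, not from the patch, just to make the semantics explicit: ktime_sub(a, b) computes a - b, so remaining time is expiry minus now, not the reverse.

#include <linux/ktime.h>

/* illustrative helper only */
static ktime_t demo_time_remaining(ktime_t expires, ktime_t now)
{
        /* a negative result means the expiry already lies in the past */
        return ktime_sub(expires, now);
}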
+ */ +void tick_offline_cpu(unsigned int cpu) +{ + raw_spin_lock(&clockevents_lock); + tick_broadcast_offline(cpu); + raw_spin_unlock(&clockevents_lock); +} +# endif + /** * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu) raw_spin_lock_irqsave(&clockevents_lock, flags); - tick_shutdown_broadcast_oneshot(cpu); - tick_shutdown_broadcast(cpu); tick_shutdown(cpu); /* * Unregister the clock event devices which were diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3bcc19ceb073..fff5f64981c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static void inline clocksource_watchdog_lock(unsigned long *flags) +static inline void clocksource_watchdog_lock(unsigned long *flags) { spin_lock_irqsave(&watchdog_lock, *flags); } -static void inline clocksource_watchdog_unlock(unsigned long *flags) +static inline void clocksource_watchdog_unlock(unsigned long *flags) { spin_unlock_irqrestore(&watchdog_lock, *flags); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index f5cfa1b73d6f..5ee77f1a8a92 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -30,7 +30,6 @@ #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> -#include <linux/seq_file.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> @@ -364,7 +363,7 @@ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + /* fall through */ default: return false; } @@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * @timer: hrtimer to stop * * Returns: - * 0 when the timer was not active - * 1 when the timer was active - * -1 when the timer is currently executing the callback function and + * + * * 0 when the timer was not active + * * 1 when the timer was active + * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) @@ -1771,7 +1771,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(nanosleep, struct old_timespec32 __user *, rqtp, +SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index dc1b6f1929f9..d23b434c2ca7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -89,7 +89,7 @@ struct clocksource * __init __weak clocksource_default_clock(void) return &clocksource_jiffies; } -struct clocksource refined_jiffies; +static struct clocksource refined_jiffies; int register_refined_jiffies(long cycles_per_second) { diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36a2bef00125..65eb796610dc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,6 +17,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> +#include <linux/audit.h> #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -42,6 +43,7 @@ static u64 tick_length_base; #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ 
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) +#define MAX_TAI_OFFSET 100000 /* * phase-lock loop variables @@ -188,13 +190,13 @@ static inline int is_error_status(int status) && (status & (STA_PPSWANDER|STA_PPSERROR))); } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc) { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = pps_jitter; if (!(time_status & STA_NANO)) - txc->jitter /= NSEC_PER_USEC; + txc->jitter = pps_jitter / NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; @@ -220,7 +222,7 @@ static inline int is_error_status(int status) return status & (STA_UNSYNC|STA_CLOCKERR); } -static inline void pps_fill_timex(struct timex *txc) +static inline void pps_fill_timex(struct __kernel_timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; @@ -633,7 +635,7 @@ void ntp_notify_cmos_timer(void) /* * Propagate a new txc->status value into the NTP state: */ -static inline void process_adj_status(const struct timex *txc) +static inline void process_adj_status(const struct __kernel_timex *txc) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; @@ -656,7 +658,8 @@ static inline void process_adj_status(const struct timex *txc) } -static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai) +static inline void process_adjtimex_modes(const struct __kernel_timex *txc, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc); @@ -689,7 +692,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai time_constant = max(time_constant, 0l); } - if (txc->modes & ADJ_TAI && txc->constant > 0) + if (txc->modes & ADJ_TAI && + txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) @@ -707,7 +711,8 @@ static inline void process_adjtimex_modes(const struct timex *txc, s32 *time_tai * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
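As the comment above says, __do_adjtimex() backs adjtimex(2), which mostly reads (and, for privileged callers, writes) the kernel NTP state. A small userspace sketch of a read-only query; with modes set to zero nothing is modified, so no privileges are needed:

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
        struct timex tx = { .modes = 0 };       /* query only, change nothing */
        int state = adjtimex(&tx);

        if (state < 0) {
                perror("adjtimex");
                return 1;
        }
        /* return value is the clock state: TIME_OK, TIME_INS, TIME_ERROR, ... */
        printf("state %d offset %ld freq %ld status 0x%x\n",
               state, tx.offset, tx.freq, (unsigned int)tx.status);
        return 0;
}

The ADJ_TAI clamp added above (constant must lie between 0 and MAX_TAI_OFFSET) only affects the write path; a query like this is unaffected.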
*/ -int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) +int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { int result; @@ -718,18 +723,33 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; ntp_update_frequency(); + + audit_ntp_set_old(ad, AUDIT_NTP_ADJUST, save_adjust); + audit_ntp_set_new(ad, AUDIT_NTP_ADJUST, time_adjust); } txc->offset = save_adjust; } else { - /* If there are input parameters, then process them: */ - if (txc->modes) + if (txc->modes) { + audit_ntp_set_old(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_old(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_old(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_old(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_old(ad, AUDIT_NTP_TICK, tick_usec); + process_adjtimex_modes(txc, time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_OFFSET, time_offset); + audit_ntp_set_new(ad, AUDIT_NTP_FREQ, time_freq); + audit_ntp_set_new(ad, AUDIT_NTP_STATUS, time_status); + audit_ntp_set_new(ad, AUDIT_NTP_TAI, *time_tai); + audit_ntp_set_new(ad, AUDIT_NTP_TICK, tick_usec); + } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; + txc->offset = (u32)txc->offset / NSEC_PER_USEC; } result = time_state; /* mostly `TIME_OK' */ @@ -754,7 +774,7 @@ int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai) txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) - txc->time.tv_usec /= NSEC_PER_USEC; + txc->time.tv_usec = ts->tv_nsec / NSEC_PER_USEC; /* Handle leapsec adjustments */ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index c24b0e13f011..908ecaa65fc3 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,6 +8,8 @@ extern void ntp_clear(void); extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct timex *txc, const struct timespec64 *ts, s32 *time_tai); +extern int __do_adjtimex(struct __kernel_timex *txc, + const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 425bbfce6819..ec960bb939fd 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -228,7 +228,7 @@ static void put_clock_desc(struct posix_clock_desc *cd) fput(cd->fp); } -static int pc_clock_adjtime(clockid_t id, struct timex *tx) +static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) { struct posix_clock_desc cd; int err; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 80f955210861..0a426f4e3125 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -67,13 +67,13 @@ static void bump_cpu_timer(struct k_itimer *timer, u64 now) int i; u64 delta, incr; - if (timer->it.cpu.incr == 0) + if (!timer->it_interval) return; if (now < timer->it.cpu.expires) return; - incr = timer->it.cpu.incr; + incr = timer->it_interval; delta = now + incr - timer->it.cpu.expires; /* Don't use (incr*2 < delta), incr*2 might overflow. 
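The bump_cpu_timer() comment above warns against testing incr * 2 because the multiplication can overflow. A self-contained sketch of the same catch-up technique with hypothetical names: the expiry is advanced past now in power-of-two multiples of the interval, and overruns are counted, without ever forming incr * 2.

#include <linux/types.h>

static u64 demo_bump_expiry(u64 expires, u64 now, u64 incr, u64 *overruns)
{
        u64 delta;
        int i;

        if (now < expires || !incr)
                return expires;

        delta = now + incr - expires;

        /* grow incr in powers of two while it still fits twice into delta */
        for (i = 0; incr < delta - incr; i++)
                incr <<= 1;

        /* subtract the multiples back out, largest first */
        for (; i >= 0; incr >>= 1, i--) {
                if (delta < incr)
                        continue;
                delta -= incr;
                expires += incr;
                *overruns += 1ULL << i;
        }
        return expires;
}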
*/ @@ -520,7 +520,7 @@ static void cpu_timer_fire(struct k_itimer *timer) */ wake_up_process(timer->it_process); timer->it.cpu.expires = 0; - } else if (timer->it.cpu.incr == 0) { + } else if (!timer->it_interval) { /* * One-shot timer. Clear it as soon as it's fired. */ @@ -606,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, */ ret = 0; - old_incr = timer->it.cpu.incr; + old_incr = timer->it_interval; old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; @@ -684,8 +684,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. */ - timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); - timer->it_interval = ns_to_ktime(timer->it.cpu.incr); + timer->it_interval = timespec64_to_ktime(new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -724,7 +723,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp /* * Easy part: convert the reload time. */ - itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); + itp->it_interval = ktime_to_timespec64(timer->it_interval); if (!timer->it.cpu.expires) return; diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index a51895486e5e..67df65f887ac 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -45,6 +45,7 @@ SYS_NI(timer_delete); SYS_NI(clock_adjtime); SYS_NI(getitimer); SYS_NI(setitimer); +SYS_NI(clock_adjtime32); #ifdef __ARCH_WANT_SYS_ALARM SYS_NI(alarm); #endif @@ -150,16 +151,16 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT COMPAT_SYS_NI(timer_create); -COMPAT_SYS_NI(clock_adjtime); -COMPAT_SYS_NI(timer_settime); -COMPAT_SYS_NI(timer_gettime); COMPAT_SYS_NI(getitimer); COMPAT_SYS_NI(setitimer); #endif #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYS_NI(timer_settime32); +SYS_NI(timer_gettime32); + +SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 new_tp; @@ -171,8 +172,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, return do_sys_settimeofday64(&new_tp, NULL); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { int ret; struct timespec64 kernel_tp; @@ -186,8 +187,8 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return 0; } -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { struct timespec64 rtn_tp = { .tv_sec = 0, @@ -206,9 +207,9 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, } } -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { struct timespec64 t; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 0e84bb72a3da..d7f2d91acdac 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -179,7 +179,7 
@@ static int posix_clock_realtime_set(const clockid_t which_clock, } static int posix_clock_realtime_adj(const clockid_t which_clock, - struct timex *t) + struct __kernel_timex *t) { return do_adjtimex(t); } @@ -730,8 +730,8 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, - struct old_itimerspec32 __user *, setting) +SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, + struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; @@ -903,9 +903,9 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, } #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, - struct old_itimerspec32 __user *, new, - struct old_itimerspec32 __user *, old) +SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, + struct old_itimerspec32 __user *, new, + struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; @@ -980,23 +980,16 @@ retry_delete: */ static void itimer_delete(struct k_itimer *timer) { - unsigned long flags; - retry_delete: - spin_lock_irqsave(&timer->it_lock, flags); + spin_lock_irq(&timer->it_lock); if (timer_delete_hook(timer) == TIMER_RETRY) { - unlock_timer(timer, flags); + spin_unlock_irq(&timer->it_lock); goto retry_delete; } list_del(&timer->list); - /* - * This keeps any tasks waiting on the spin lock from thinking - * they got something (see the lock code above). - */ - timer->it_signal = NULL; - unlock_timer(timer, flags); + spin_unlock_irq(&timer->it_lock); release_posix_timer(timer, IT_ID_SET); } @@ -1047,22 +1040,28 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return error; } -SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, - struct timex __user *, utx) +int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) { const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; - int err; if (!kc) return -EINVAL; if (!kc->clock_adj) return -EOPNOTSUPP; + return kc->clock_adj(which_clock, ktx); +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct __kernel_timex __user *, utx) +{ + struct __kernel_timex ktx; + int err; + if (copy_from_user(&ktx, utx, sizeof(ktx))) return -EFAULT; - err = kc->clock_adj(which_clock, &ktx); + err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; @@ -1090,8 +1089,8 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1105,8 +1104,8 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, return kc->clock_set(which_clock, &ts); } -COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1123,40 +1122,26 @@ COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, return err; } -#endif - -#ifdef CONFIG_COMPAT - -COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, - struct compat_timex __user *, utp) 
+SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, + struct old_timex32 __user *, utp) { - const struct k_clock *kc = clockid_to_kclock(which_clock); - struct timex ktx; + struct __kernel_timex ktx; int err; - if (!kc) - return -EINVAL; - if (!kc->clock_adj) - return -EOPNOTSUPP; - - err = compat_get_timex(&ktx, utp); + err = get_old_timex32(&ktx, utp); if (err) return err; - err = kc->clock_adj(which_clock, &ktx); + err = do_clock_adjtime(which_clock, &ktx); if (err >= 0) - err = compat_put_timex(utp, &ktx); + err = put_old_timex32(utp, &ktx); return err; } -#endif - -#ifdef CONFIG_COMPAT_32BIT_TIME - -COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, - struct old_timespec32 __user *, tp) +SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, + struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; @@ -1212,9 +1197,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, #ifdef CONFIG_COMPAT_32BIT_TIME -COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, - struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) +SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, + struct old_timespec32 __user *, rqtp, + struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index ddb21145211a..de5daa6d975a 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -8,7 +8,7 @@ struct k_clock { const struct timespec64 *tp); int (*clock_get)(const clockid_t which_clock, struct timespec64 *tp); - int (*clock_adj)(const clockid_t which_clock, struct timex *tx); + int (*clock_adj)(const clockid_t which_clock, struct __kernel_timex *tx); int (*timer_create)(struct k_itimer *timer); int (*nsleep)(const clockid_t which_clock, int flags, const struct timespec64 *); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 094b82ca95e5..142b07619918 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { u64 cyc, res; - unsigned long seq; + unsigned int seq; struct clock_read_data *rd; do { @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); - pr_debug("Registered %pF as sched_clock source\n", read); + pr_debug("Registered %pS as sched_clock source\n", read); } void __init generic_sched_clock_init(void) @@ -267,12 +267,12 @@ void __init generic_sched_clock_init(void) */ static u64 notrace suspended_sched_clock_read(void) { - unsigned long seq = raw_read_seqcount(&cd.seq); + unsigned int seq = raw_read_seqcount(&cd.seq); return cd.read_data[seq & 1].epoch_cyc; } -static int sched_clock_suspend(void) +int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -283,7 +283,7 @@ static int sched_clock_suspend(void) return 0; } -static void sched_clock_resume(void) +void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 803fa67aace9..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp 
DEFINE_RAW_SPINLOCK(tick_broadcast_lock); static void tick_broadcast_setup_oneshot(struct clock_event_device *bc); static void tick_broadcast_clear_oneshot(int cpu); static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU +static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif #else static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_clear_oneshot(int cpu) { } static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif #endif /* @@ -375,6 +381,7 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) switch (mode) { case TICK_BROADCAST_FORCE: tick_broadcast_forced = 1; + /* fall through */ case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { @@ -432,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) } #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void) { - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - bc = tick_broadcast_device.evtdev; - cpumask_clear_cpu(cpu, tick_broadcast_mask); - cpumask_clear_cpu(cpu, tick_broadcast_on); + struct clock_event_device *bc = tick_broadcast_device.evtdev; if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpumask_empty(tick_broadcast_mask)) clockevents_shutdown(bc); } +} - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ + raw_spin_lock(&tick_broadcast_lock); + cpumask_clear_cpu(cpu, tick_broadcast_mask); + cpumask_clear_cpu(cpu, tick_broadcast_on); + tick_broadcast_oneshot_offline(cpu); + tick_shutdown_broadcast(); + raw_spin_unlock(&tick_broadcast_lock); } + #endif void tick_suspend_broadcast(void) @@ -800,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * either the CPU handling the broadcast * interrupt or we got woken by something else. * - * We are not longer in the broadcast mask, so + * We are no longer in the broadcast mask, so * if the cpu local expiry time is already * reached, we would reprogram the cpu local * timer with an already expired event. * * This can lead to a ping-pong when we return - * to idle and therefor rearm the broadcast + * to idle and therefore rearm the broadcast * timer before the cpu local timer was able * to fire. This happens because the forced * reprogramming makes sure that the event @@ -949,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu) { - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Clear the broadcast masks for the dead cpu, but do not stop * the broadcast device! 
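The hunks above consolidate the broadcast teardown into tick_broadcast_offline(), which the hotplug core reaches through tick_offline_cpu(). For comparison, ordinary subsystems attach to the same hotplug machinery with cpuhp_setup_state(); a hedged sketch with made-up names (the tick core itself uses fixed hotplug states, not a dynamic one like this):

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/printk.h>

static int demo_cpu_online(unsigned int cpu)
{
        pr_info("demo: cpu %u came online\n", cpu);
        return 0;
}

static int demo_cpu_offline(unsigned int cpu)
{
        pr_info("demo: cpu %u is going down\n", cpu);
        return 0;
}

static int __init demo_hotplug_init(void)
{
        int ret;

        /* the teardown callback runs for each CPU as it goes offline */
        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
                                demo_cpu_online, demo_cpu_offline);
        return ret < 0 ? ret : 0;
}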
@@ -964,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 529143b4c8d2..59225b484e4e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -46,6 +46,14 @@ ktime_t tick_period; * procedure also covers cpu hotplug. */ int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. + */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif /* * Debugging: see timer_list.c @@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) !tick_broadcast_oneshot_active()) { clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { - unsigned long seq; + unsigned int seq; ktime_t next; do { @@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) } } +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ + int cpu = *(unsigned int *)info; + + WARN_ON(tick_do_timer_cpu != smp_processor_id()); + + tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ + int cpu = smp_processor_id(); + int from = tick_do_timer_boot_cpu; + + if (from >= 0 && from != cpu) + smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif + /* * Setup the tick device */ @@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - else - tick_do_timer_cpu = TICK_DO_TIMER_NONE; + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); tick_period = NSEC_PER_SEC / HZ; +#ifdef CONFIG_NO_HZ_FULL + /* + * The boot CPU may be nohz_full, in which case set + * tick_do_timer_boot_cpu so the first housekeeping + * secondary that comes up will take do_timer from + * us. 
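tick_take_do_timer_from_boot() above transfers tick_do_timer_cpu by running a small function on the current owner CPU. A stripped-down sketch of that hand-off pattern with hypothetical names; smp_call_function_single() executes the callback on the target CPU in IPI context and, with wait set, returns only once it has finished:

#include <linux/kernel.h>
#include <linux/smp.h>

static int demo_owner_cpu = -1;         /* hypothetical resource owner */

static void demo_give_up_ownership(void *info)
{
        int new_owner = *(int *)info;

        /* runs on the old owner CPU, with interrupts disabled */
        WARN_ON(demo_owner_cpu != smp_processor_id());
        demo_owner_cpu = new_owner;
}

static void demo_take_ownership_from(int from)
{
        int cpu = get_cpu();            /* stay on this CPU during the hand-off */

        if (from >= 0 && from != cpu)
                smp_call_function_single(from, demo_give_up_ownership, &cpu, 1);
        put_cpu();
}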
+ */ + if (tick_nohz_full_cpu(cpu)) + tick_do_timer_boot_cpu = cpu; + + } else if (tick_do_timer_boot_cpu != -1 && + !tick_nohz_full_cpu(cpu)) { + tick_take_do_timer_from_boot(); + tick_do_timer_boot_cpu = -1; + WARN_ON(tick_do_timer_cpu != cpu); +#endif } /* @@ -487,6 +529,7 @@ void tick_freeze(void) trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); system_state = SYSTEM_SUSPEND; + sched_clock_suspend(); timekeeping_suspend(); } else { tick_suspend_local(); @@ -510,6 +553,7 @@ void tick_unfreeze(void) if (tick_freeze_depth == num_online_cpus()) { timekeeping_resume(); + sched_clock_resume(); system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } static inline bool tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif + /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..be9707f68024 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched 
*ts, ktime_t now) * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. + * + * If nohz_full is enabled, this should not happen because the + * tick_do_timer_cpu never relinquishes. */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) - && !tick_nohz_full_cpu(cpu)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL + WARN_ON(tick_nohz_full_running); +#endif tick_do_timer_cpu = cpu; + } #endif /* Check, if the jiffies need an update */ @@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask) static int tick_nohz_cpu_down(unsigned int cpu) { /* - * The boot CPU handles housekeeping duty (unbound timers, - * workqueues, timekeeping, ...) on behalf of full dynticks + * The tick_do_timer_cpu CPU handles housekeeping duty (unbound + * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) @@ -423,12 +429,15 @@ void __init tick_nohz_init(void) return; } - cpu = smp_processor_id(); + if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && + !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { + cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", - cpu); - cpumask_clear_cpu(cpu, tick_nohz_full_mask); + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { + pr_warn("NO_HZ: Clearing %d from nohz_full range " + "for timekeeping\n", cpu); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); + } } for_each_cpu(cpu, tick_nohz_full_mask) @@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void) static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; - unsigned long seq, basejiff; + unsigned long basejiff; + unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { @@ -772,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) */ if (!ts->tick_stopped) { calc_load_nohz_start(); - cpu_load_update_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); @@ -819,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); - cpu_load_update_nohz_stop(); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and @@ -904,8 +912,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) /* * Boot safety: make sure the timekeeping duty has been * assigned before entering dyntick-idle mode, + * tick_do_timer_cpu is TICK_DO_TIMER_BOOT */ - if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) + return false; + + /* Should not happen for nohz-full */ + if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } @@ -1023,6 +1036,18 @@ bool tick_nohz_idle_got_tick(void) } /** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whatever that expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. 
+ * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ + return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + +/** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode { * struct tick_sched - sched tick emulation and no idle tick control/stats * @sched_timer: hrtimer to schedule the periodic tick in high * resolution mode + * @check_clocks: Notification mechanism about clocksource changes + * @nohz_mode: Mode - one state of tick_nohz_mode + * @inidle: Indicator that the CPU is in the tick idle mode + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_active: Indicator that the CPU is actively in the tick idle mode; + * it is resetted during irq handling phases. + * @do_timer_lst: CPU was the last one doing do_timer before going idle + * @got_idle_tick: Tick timer function has run with @inidle set * @last_tick: Store the last tick expiry time when the tick * timer is modified for nohz sleeps. This is necessary * to resume the tick timer operation in the timeline * when the CPU returns from nohz sleep. * @next_tick: Next tick to be fired when in dynticks mode. - * @tick_stopped: Indicator that the idle tick has been stopped * @idle_jiffies: jiffies at the entry to idle for idle time accounting * @idle_calls: Total number of idle calls * @idle_sleeps: Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode { * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped) * @timer_expires_base: Base time clock monotonic for @timer_expires - * @do_timer_lst: CPU was the last one doing do_timer before going idle - * @got_idle_tick: Tick timer function has run with @inidle set + * @next_timer: Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick */ struct tick_sched { struct hrtimer sched_timer; diff --git a/kernel/time/time.c b/kernel/time/time.c index 2edb5088a70b..5c54ca632d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -98,11 +98,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME */ -#ifdef CONFIG_COMPAT -#ifdef __ARCH_WANT_COMPAT_SYS_TIME +#ifdef CONFIG_COMPAT_32BIT_TIME +#ifdef __ARCH_WANT_SYS_TIME32 /* old_time32_t is a 32 bit "long" and needs to get converted. 
*/ -COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) +SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc) { old_time32_t i; @@ -116,7 +116,7 @@ COMPAT_SYSCALL_DEFINE1(time, old_time32_t __user *, tloc) return i; } -COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) +SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) { struct timespec64 tv; int err; @@ -134,7 +134,7 @@ COMPAT_SYSCALL_DEFINE1(stime, old_time32_t __user *, tptr) return 0; } -#endif /* __ARCH_WANT_COMPAT_SYS_TIME */ +#endif /* __ARCH_WANT_SYS_TIME32 */ #endif SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, @@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz static int firsttime = 1; int error = 0; - if (tv && !timespec64_valid(tv)) + if (tv && !timespec64_valid_settod(tv)) return -EINVAL; error = security_settime64(tv, tz); @@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, if (tv) { if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } @@ -263,35 +267,99 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { - struct timex txc; /* Local copy of parameter */ + struct __kernel_timex txc; /* Local copy of parameter */ int ret; /* Copy the user data space into the kernel copy * structure. But bear in mind that the structures * may change */ - if (copy_from_user(&txc, txc_p, sizeof(struct timex))) + if (copy_from_user(&txc, txc_p, sizeof(struct __kernel_timex))) return -EFAULT; ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; + return copy_to_user(txc_p, &txc, sizeof(struct __kernel_timex)) ? 
-EFAULT : ret; } +#endif -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME +int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp) +{ + struct old_timex32 tx32; + + memset(txc, 0, sizeof(struct __kernel_timex)); + if (copy_from_user(&tx32, utp, sizeof(struct old_timex32))) + return -EFAULT; -COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) + txc->modes = tx32.modes; + txc->offset = tx32.offset; + txc->freq = tx32.freq; + txc->maxerror = tx32.maxerror; + txc->esterror = tx32.esterror; + txc->status = tx32.status; + txc->constant = tx32.constant; + txc->precision = tx32.precision; + txc->tolerance = tx32.tolerance; + txc->time.tv_sec = tx32.time.tv_sec; + txc->time.tv_usec = tx32.time.tv_usec; + txc->tick = tx32.tick; + txc->ppsfreq = tx32.ppsfreq; + txc->jitter = tx32.jitter; + txc->shift = tx32.shift; + txc->stabil = tx32.stabil; + txc->jitcnt = tx32.jitcnt; + txc->calcnt = tx32.calcnt; + txc->errcnt = tx32.errcnt; + txc->stbcnt = tx32.stbcnt; + + return 0; +} + +int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc) +{ + struct old_timex32 tx32; + + memset(&tx32, 0, sizeof(struct old_timex32)); + tx32.modes = txc->modes; + tx32.offset = txc->offset; + tx32.freq = txc->freq; + tx32.maxerror = txc->maxerror; + tx32.esterror = txc->esterror; + tx32.status = txc->status; + tx32.constant = txc->constant; + tx32.precision = txc->precision; + tx32.tolerance = txc->tolerance; + tx32.time.tv_sec = txc->time.tv_sec; + tx32.time.tv_usec = txc->time.tv_usec; + tx32.tick = txc->tick; + tx32.ppsfreq = txc->ppsfreq; + tx32.jitter = txc->jitter; + tx32.shift = txc->shift; + tx32.stabil = txc->stabil; + tx32.jitcnt = txc->jitcnt; + tx32.calcnt = txc->calcnt; + tx32.errcnt = txc->errcnt; + tx32.stbcnt = txc->stbcnt; + tx32.tai = txc->tai; + if (copy_to_user(utp, &tx32, sizeof(struct old_timex32))) + return -EFAULT; + return 0; +} + +SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) { - struct timex txc; + struct __kernel_timex txc; int err, ret; - err = compat_get_timex(&txc, utp); + err = get_old_timex32(&txc, utp); if (err) return err; ret = do_adjtimex(&txc); - err = compat_put_timex(utp, &txc); + err = put_old_timex32(utp, &txc); if (err) return err; @@ -719,6 +787,16 @@ u64 jiffies64_to_nsecs(u64 j) } EXPORT_SYMBOL(jiffies64_to_nsecs); +u64 jiffies64_to_msecs(const u64 j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#else + return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); +#endif +} +EXPORT_SYMBOL(jiffies64_to_msecs); + /** * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ac5dbf2cd4a2..d911c8470149 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -21,6 +21,7 @@ #include <linux/stop_machine.h> #include <linux/pvclock_gtod.h> #include <linux/compiler.h> +#include <linux/audit.h> #include "tick-internal.h" #include "ntp_internal.h" @@ -720,7 +721,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) void ktime_get_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -807,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; ktime_t base, *offset = offsets[offs]; + u64 nsecs; WARN_ON(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); 
base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; } while (read_seqcount_retry(&tk_core.seq, seq)); - return base; - + return ktime_add_ns(base, nsecs); } EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); @@ -829,7 +831,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { ktime_t *offset = offsets[offs]; - unsigned long seq; + unsigned int seq; ktime_t tconv; do { @@ -960,7 +962,7 @@ time64_t __ktime_get_real_seconds(void) void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; ktime_t base_raw; ktime_t base_real; u64 nsec_raw; @@ -1122,7 +1124,7 @@ int get_device_system_crosststamp(int (*get_time_fn) ktime_t base_real, base_raw; u64 nsec_real, nsec_raw; u8 cs_was_changed_seq; - unsigned long seq; + unsigned int seq; bool do_interp; int ret; @@ -1221,7 +1223,7 @@ int do_settimeofday64(const struct timespec64 *ts) unsigned long flags; int ret = 0; - if (!timespec64_valid_strict(ts)) + if (!timespec64_valid_settod(ts)) return -EINVAL; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1250,6 +1252,9 @@ out: /* signal hrtimers about time change */ clock_was_set(); + if (!ret) + audit_tk_injoffset(ts_delta); + return ret; } EXPORT_SYMBOL(do_settimeofday64); @@ -1278,7 +1283,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tk), *ts); if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || - !timespec64_valid_strict(&tmp)) { + !timespec64_valid_settod(&tmp)) { ret = -EINVAL; goto error; } @@ -1409,7 +1414,7 @@ int timekeeping_notify(struct clocksource *clock) void ktime_get_raw_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 nsecs; do { @@ -1431,7 +1436,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64); int timekeeping_valid_for_hres(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; int ret; do { @@ -1450,7 +1455,7 @@ int timekeeping_valid_for_hres(void) u64 timekeeping_max_deferment(void) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; u64 ret; do { @@ -1527,7 +1532,7 @@ void __init timekeeping_init(void) unsigned long flags; read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); - if (timespec64_valid_strict(&wall_time) && + if (timespec64_valid_settod(&wall_time) && timespec64_to_ns(&wall_time) > 0) { persistent_clock_exists = true; } else if (timespec64_to_ns(&wall_time) != 0) { @@ -2150,7 +2155,7 @@ EXPORT_SYMBOL_GPL(getboottime64); void ktime_get_coarse_real_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2169,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; struct timespec64 now, mono; - unsigned long seq; + unsigned int seq; do { seq = read_seqcount_begin(&tk_core.seq); @@ -2234,7 +2239,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /** * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be 
used with any other mode bits */ @@ -2300,9 +2305,10 @@ static int timekeeping_validate_timex(const struct timex *txc) /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function */ -int do_adjtimex(struct timex *txc) +int do_adjtimex(struct __kernel_timex *txc) { struct timekeeper *tk = &tk_core.timekeeper; + struct audit_ntp_data ad; unsigned long flags; struct timespec64 ts; s32 orig_tai, tai; @@ -2322,15 +2328,19 @@ int do_adjtimex(struct timex *txc) ret = timekeeping_inject_offset(&delta); if (ret) return ret; + + audit_tk_injoffset(delta); } + audit_ntp_init(&ad); + ktime_get_real_ts64(&ts); raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); orig_tai = tai = tk->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai); + ret = __do_adjtimex(txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); @@ -2341,6 +2351,8 @@ int do_adjtimex(struct timex *txc) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + audit_ntp_log(&ad); + /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) timekeeping_advance(TK_ADV_FREQ); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 7a9b4eb7a1d5..141ab3ab0354 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -14,6 +14,13 @@ extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); +#ifdef CONFIG_GENERIC_SCHED_CLOCK +extern int sched_clock_suspend(void); +extern void sched_clock_resume(void); +#else +static inline int sched_clock_suspend(void) { return 0; } +static inline void sched_clock_resume(void) { } +#endif extern void do_timer(unsigned long ticks); extern void update_wall_time(void); diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 86489950d690..b73e8850e58d 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -37,15 +37,8 @@ DEFINE_SHOW_ATTRIBUTE(tk_debug_sleep_time); static int __init tk_debug_sleep_time_init(void) { - struct dentry *d; - - d = debugfs_create_file("sleep_time", 0444, NULL, NULL, - &tk_debug_sleep_time_fops); - if (!d) { - pr_err("Failed to create sleep_time debug file\n"); - return -ENOMEM; - } - + debugfs_create_file("sleep_time", 0444, NULL, NULL, + &tk_debug_sleep_time_fops); return 0; } late_initcall(tk_debug_sleep_time_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 444156debfa0..343c7ba33b1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); + + trace_timer_start(timer, timer->expires, timer->flags); } static void @@ -647,7 +649,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) case ODEBUG_STATE_ACTIVE: WARN_ON(1); - + /* fall through */ default: return false; } @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer) trace_timer_init(timer); } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ - debug_timer_activate(timer); - trace_timer_start(timer, expires, timer->flags); -} - static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, 
unsigned long expires, unsigned int option } } - debug_activate(timer, expires); + debug_timer_activate(timer); timer->expires = expires; /* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu) } forward_timer_base(base); - debug_activate(timer, timer->expires); + debug_timer_activate(timer); internal_add_timer(base, timer); raw_spin_unlock_irqrestore(&base->lock, flags); } @@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(del_timer_sync); #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, + void (*fn)(struct timer_list *), + unsigned long baseclk) { int count = preempt_count(); @@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list */ lock_map_acquire(&lockdep_map); - trace_timer_expire_entry(timer); + trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { - WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent @@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list static void expire_timers(struct timer_base *base, struct hlist_head *head) { + /* + * This value is required only for tracing. base->clk was + * incremented directly before expire_timers was called. But expiry + * is related to the old base->clk value. + */ + unsigned long baseclk = base->clk - 1; + while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); @@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head) if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); } else { raw_spin_unlock_irq(&base->lock); - call_timer_fn(timer, fn); + call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); } } @@ -1632,7 +1636,7 @@ void update_process_times(int user_tick) /* Note: this timer irq context must be accounted for as well. 
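The timer.c hunks above move trace_timer_start() into enqueue_timer() and hand the base clock to the expiry tracepoints. For context, a minimal sketch of the timer API those tracepoints observe, using a hypothetical demo timer; this only illustrates the interface and is not part of the patch:

#include <linux/jiffies.h>
#include <linux/printk.h>
#include <linux/timer.h>

static struct timer_list demo_timer;

static void demo_timer_fn(struct timer_list *t)
{
        /* softirq context; trace_timer_expire_entry/exit bracket this call */
        pr_info("demo timer fired\n");
        mod_timer(t, jiffies + HZ);     /* re-arm one second later */
}

static void demo_timer_start(void)
{
        timer_setup(&demo_timer, demo_timer_fn, 0);
        /* with the change above, trace_timer_start fires from enqueue_timer() */
        mod_timer(&demo_timer, jiffies + HZ);
}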
*/ account_process_tick(p, user_tick); run_local_timers(); - rcu_check_callbacks(user_tick); + rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 98ba50dcb1b2..acb326f5f50a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now) SEQ_printf(m, "\n"); } -static int timer_list_show(struct seq_file *m, void *v) -{ - struct timer_list_iter *iter = v; - - if (iter->cpu == -1 && !iter->second_pass) - timer_list_header(m, iter->now); - else if (!iter->second_pass) - print_cpu(m, iter->cpu, iter->now); -#ifdef CONFIG_GENERIC_CLOCKEVENTS - else if (iter->cpu == -1 && iter->second_pass) - timer_list_show_tickdevices_header(m); - else - print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); -#endif - return 0; -} - void sysrq_timer_list_show(void) { u64 now = ktime_to_ns(ktime_get()); @@ -317,6 +300,24 @@ void sysrq_timer_list_show(void) return; } +#ifdef CONFIG_PROC_FS +static int timer_list_show(struct seq_file *m, void *v) +{ + struct timer_list_iter *iter = v; + + if (iter->cpu == -1 && !iter->second_pass) + timer_list_header(m, iter->now); + else if (!iter->second_pass) + print_cpu(m, iter->cpu, iter->now); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + else if (iter->cpu == -1 && iter->second_pass) + timer_list_show_tickdevices_header(m); + else + print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); +#endif + return 0; +} + static void *move_iter(struct timer_list_iter *iter, loff_t offset) { for (; offset; offset--) { @@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void) return 0; } __initcall(init_timer_list_procfs); +#endif diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c new file mode 100644 index 000000000000..8cf3596a4ce6 --- /dev/null +++ b/kernel/time/vsyscall.c @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 ARM Ltd. + * + * Generic implementation of update_vsyscall and update_vsyscall_tz. + * + * Based on the x86 specific implementation. 
+ */ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> +#include <vdso/vsyscall.h> + +static inline void update_vdso_data(struct vdso_data *vdata, + struct timekeeper *tk) +{ + struct vdso_timestamp *vdso_ts; + u64 nsec; + + vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; + vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; + vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; + vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; + vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; + vdata[CS_RAW].mask = tk->tkr_raw.mask; + vdata[CS_RAW].mult = tk->tkr_raw.mult; + vdata[CS_RAW].shift = tk->tkr_raw.shift; + + /* CLOCK_REALTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* CLOCK_MONOTONIC */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_MONOTONIC_RAW */ + vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts->sec = tk->raw_sec; + vdso_ts->nsec = tk->tkr_raw.xtime_nsec; + + /* CLOCK_BOOTTIME */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec; + nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + + ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); + while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); + vdso_ts->sec++; + } + vdso_ts->nsec = nsec; + + /* CLOCK_TAI */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec; + + /* + * Read without the seqlock held by clock_getres(). + * Note: No need to have a second copy. + */ + WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_timestamp *vdso_ts; + u64 nsec; + + if (__arch_update_vdso_data()) { + /* + * Some architectures might want to skip the update of the + * data page. 
+ */ + return; + } + + /* copy vsyscall data */ + vdso_write_begin(vdata); + + vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); + vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); + + /* CLOCK_REALTIME_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts->sec = tk->xtime_sec; + vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + + /* CLOCK_MONOTONIC_COARSE */ + vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = nsec + tk->wall_to_monotonic.tv_nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); + + if (__arch_use_vsyscall(vdata)) + update_vdso_data(vdata, tk); + + __arch_update_vsyscall(vdata, tk); + + vdso_write_end(vdata); + + __arch_sync_vdso_data(vdata); +} + +void update_vsyscall_tz(void) +{ + struct vdso_data *vdata = __arch_get_k_vdso_data(); + + if (__arch_use_vsyscall(vdata)) { + vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; + vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + } + + __arch_sync_vdso_data(vdata); +} diff --git a/kernel/torture.c b/kernel/torture.c index bbf6d473e50c..a8d9bdfba7c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -1,23 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0+ /* * Common functions for in-kernel torture tests. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Author: Paul E. McKenney <paulmck@linux.ibm.com> * Based on kernel/rcu/torture.c. */ @@ -53,7 +40,7 @@ #include "rcu/rcu.h" MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); +MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); static char *torture_type; static int verbose; @@ -75,6 +62,7 @@ static DEFINE_MUTEX(fullstop_mutex); static struct task_struct *onoff_task; static long onoff_holdoff; static long onoff_interval; +static torture_ofl_func *onoff_f; static long n_offline_attempts; static long n_offline_successes; static unsigned long sum_offline; @@ -100,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) return false; + if (num_online_cpus() <= 1) + return false; /* Can't offline the last CPU. */ if (verbose > 1) pr_alert("%s" TORTURE_FLAG @@ -118,6 +108,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, pr_alert("%s" TORTURE_FLAG "torture_onoff task: offlined %d\n", torture_type, cpu); + if (onoff_f) + onoff_f(); (*n_offl_successes)++; delta = jiffies - starttime; *sum_offl += delta; @@ -243,11 +235,12 @@ stop: /* * Initiate online-offline handling. 
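The coarse-clock update in kernel/time/vsyscall.c above normalizes a seconds/nanoseconds pair with __iter_div_u64_rem() instead of a full 64-bit division, since the quotient is expected to be tiny. A small sketch of the same idiom with hypothetical names:

#include <linux/math64.h>
#include <linux/time64.h>

static void demo_normalize(u64 *sec, u64 *nsec)
{
        u64 rem;

        /* iterative divide: cheap when *nsec is at most a few seconds */
        *sec += __iter_div_u64_rem(*nsec, NSEC_PER_SEC, &rem);
        *nsec = rem;
}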
*/ -int torture_onoff_init(long ooholdoff, long oointerval) +int torture_onoff_init(long ooholdoff, long oointerval, torture_ofl_func *f) { #ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; + onoff_f = f; if (onoff_interval <= 0) return 0; return torture_create_kthread(torture_onoff, NULL, onoff_task); @@ -577,6 +570,7 @@ static void torture_shutdown_cleanup(void) static struct task_struct *stutter_task; static int stutter_pause_test; static int stutter; +static int stutter_gap; /* * Block until the stutter interval ends. This must be called periodically @@ -585,10 +579,12 @@ static int stutter; bool stutter_wait(const char *title) { int spt; + bool ret = false; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { + ret = true; if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { @@ -599,7 +595,7 @@ bool stutter_wait(const char *title) } torture_shutdown_absorb(title); } - return !!spt; + return ret; } EXPORT_SYMBOL_GPL(stutter_wait); @@ -609,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + int wtime; + VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { - WRITE_ONCE(stutter_pause_test, 1); - schedule_timeout_interruptible(stutter - 1); + wtime = stutter; + if (stutter > HZ + 1) { + WRITE_ONCE(stutter_pause_test, 1); + wtime = stutter - HZ - 1; + schedule_timeout_interruptible(wtime); + wtime = HZ + 1; + } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(1); + schedule_timeout_interruptible(wtime); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter); + schedule_timeout_interruptible(stutter_gap); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -629,9 +632,10 @@ static int torture_stutter(void *arg) /* * Initialize and kick off the torture_stutter kthread. */ -int torture_stutter_init(const int s) +int torture_stutter_init(const int s, const int sgap) { stutter = s; + stutter_gap = sgap; return torture_create_kthread(torture_stutter, NULL, stutter_task); } EXPORT_SYMBOL_GPL(torture_stutter_init); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index fa8b1fe824f3..564e5fdb025f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Architectures that offer an FUNCTION_TRACER implementation should # select HAVE_FUNCTION_TRACER: @@ -370,6 +371,7 @@ config PROFILE_ANNOTATED_BRANCHES config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING + imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. 
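Returning to the torture.c changes above: both torture_onoff_init() and torture_stutter_init() grew an argument, so every torture test's init path needs updating. A hedged sketch of a caller, with an invented offline callback and invented parameter values:

/* Illustrative only: adapting a torture test to the new prototypes
 * torture_onoff_init(holdoff, interval, ofl_func) and
 * torture_stutter_init(stutter, stutter_gap). */
#include <linux/torture.h>

static void my_test_cleanup_after_offline(void)    /* hypothetical callback */
{
        /* e.g. forgive forward-progress stalls caused by the departed CPU */
}

static int my_torture_init_onoff_and_stutter(long onoff_holdoff,
                                             long onoff_interval,
                                             int stutter)
{
        int ret;

        /* Third argument is new: invoked after each successful offline,
         * or NULL if the test has nothing to do at that point. */
        ret = torture_onoff_init(onoff_holdoff, onoff_interval,
                                 my_test_cleanup_after_offline);
        if (ret)
                return ret;

        /* Second argument is new: the idle gap between stutter pauses,
         * which the old code hard-wired to the stutter interval itself. */
        return torture_stutter_init(stutter, stutter + 1);
}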
@@ -773,13 +775,6 @@ config TRACE_EVAL_MAP_FILE If unsure, say N -config TRACING_EVENTS_GPIO - bool "Trace gpio events" - depends on GPIOLIB - default y - help - Enable tracing events for gpio subsystem - config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fac0ddf8a8e2..2d6e93ab0478 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, dir = debugfs_lookup(buts->name, blk_debugfs_root); if (!dir) bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); - if (!dir) - goto err; bt->dev = dev; atomic_set(&bt->dropped, 0); @@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = -EIO; bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops); - if (!bt->dropped_file) - goto err; bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); - if (!bt->msg_file) - goto err; bt->rchan = relay_open("trace", dir, buts->buf_size, buts->buf_nr, &blk_relay_callbacks, bt); @@ -723,6 +717,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #endif case BLKTRACESTART: start = 1; + /* fall through */ case BLKTRACESTOP: ret = __blk_trace_startstop(q, start); break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1a86a0d881d..ca1255d14576 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,9 +14,14 @@ #include <linux/syscalls.h> #include <linux/error-injection.h> +#include <asm/tlb.h> + #include "trace_probe.h" #include "trace.h" +#define bpf_event_rcu_dereference(p) \ + rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) + #ifdef CONFIG_MODULES struct bpf_trace_module { struct module *module; @@ -163,6 +168,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, * access_ok() should prevent writing to non-user memory, but in * some situations (nommu, temporary switch, etc) access_ok() does * not provide enough validation, hence the check on KERNEL_DS. + * + * nmi_uaccess_okay() ensures the probe is not run in an interim + * state, when the task or mm are switched. This is specifically + * required to prevent the use of temporary mm. 
*/ if (unlikely(in_interrupt() || @@ -170,6 +179,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; if (!access_ok(unsafe_ptr, size)) return -EPERM; @@ -402,8 +413,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); - static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, u64 flags, struct perf_sample_data *sd) @@ -431,28 +440,53 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_event_output(event, sd, regs); - return 0; + return perf_event_output(event, sd, regs); } +/* + * Support executing tracepoints in normal, irq, and nmi context that each call + * bpf_perf_event_output + */ +struct bpf_trace_sample_data { + struct perf_sample_data sds[3]; +}; + +static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); +static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); + struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); + int nest_level = this_cpu_inc_return(bpf_trace_nest_level); struct perf_raw_record raw = { .frag = { .size = size, .data = data, }, }; + struct perf_sample_data *sd; + int err; - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { + err = -EBUSY; + goto out; + } + + sd = &sds->sds[nest_level - 1]; + + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { + err = -EINVAL; + goto out; + } perf_sample_data_init(sd, 0, 0); sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, sd); + err = __bpf_perf_event_output(regs, map, flags, sd); + +out: + this_cpu_dec(bpf_trace_nest_level); + return err; } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -560,6 +594,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; +struct send_signal_irq_work { + struct irq_work irq_work; + struct task_struct *task; + u32 sig; +}; + +static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); + +static void do_bpf_send_signal(struct irq_work *entry) +{ + struct send_signal_irq_work *work; + + work = container_of(entry, struct send_signal_irq_work, irq_work); + group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); +} + +BPF_CALL_1(bpf_send_signal, u32, sig) +{ + struct send_signal_irq_work *work = NULL; + + /* Similar to bpf_probe_write_user, task needs to be + * in a sound condition and kernel memory access be + * permitted in order to send signal to the current + * task. + */ + if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) + return -EPERM; + if (unlikely(uaccess_kernel())) + return -EPERM; + if (unlikely(!nmi_uaccess_okay())) + return -EPERM; + + if (in_nmi()) { + /* Do an early check on signal validity. Otherwise, + * the error is lost in deferred irq_work. + */ + if (unlikely(!valid_signal(sig))) + return -EINVAL; + + work = this_cpu_ptr(&send_signal_work); + if (work->irq_work.flags & IRQ_WORK_BUSY) + return -EBUSY; + + /* Add the current task, which is the target of sending signal, + * to the irq_work. 
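The bpf_trace_nest_level scheme above, a per-CPU counter selecting one of three scratch perf_sample_data slots so that normal, irq and nmi context never share a buffer, is the same pattern reused further down for bpf_raw_tp_regs. A minimal single-threaded userspace sketch of the idea, with a plain counter standing in for this_cpu_inc_return()/this_cpu_dec() and invented names throughout; it elides the preemption and per-CPU details that make the kernel version safe:

/* Sketch of the "one scratch slot per nesting level" pattern. */
#include <stdio.h>
#include <errno.h>

#define MAX_NESTING 3   /* normal, irq, nmi */

struct scratch { char buf[64]; };

static struct scratch slots[MAX_NESTING];
static int nest_level;

static struct scratch *get_scratch(void)
{
        int level = ++nest_level;       /* kernel: this_cpu_inc_return() */

        if (level > MAX_NESTING) {      /* deeper nesting than planned for */
                --nest_level;
                return NULL;            /* kernel returns -EBUSY here */
        }
        return &slots[level - 1];
}

static void put_scratch(void)
{
        --nest_level;                   /* kernel: this_cpu_dec() */
}

static int do_event(int depth)
{
        struct scratch *s = get_scratch();

        if (!s)
                return -EBUSY;
        snprintf(s->buf, sizeof(s->buf), "event at depth %d", depth);
        if (depth < 4)                  /* pretend an irq/nmi interrupted us */
                do_event(depth + 1);
        printf("%s\n", s->buf);         /* our slot was not clobbered */
        put_scratch();
        return 0;
}

int main(void)
{
        return do_event(1) ? 1 : 0;
}

The innermost "event" simply gets -EBUSY instead of scribbling over an outer context's buffer, which is the trade-off the kernel code makes as well.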
The current task may change when queued + * irq works get executed. + */ + work->task = current; + work->sig = sig; + irq_work_queue(&work->irq_work); + return 0; + } + + return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); +} + +static const struct bpf_func_proto bpf_send_signal_proto = { + .func = bpf_send_signal, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -570,6 +667,12 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: @@ -604,6 +707,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; #endif + case BPF_FUNC_send_signal: + return &bpf_send_signal_proto; default: return NULL; } @@ -809,16 +914,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) /* * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp * to avoid potential recursive reuse issue when/if tracepoints are added - * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack + * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. + * + * Since raw tracepoints run despite bpf_prog_active, support concurrent usage + * in normal, irq, and nmi context. */ -static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +struct bpf_raw_tp_regs { + struct pt_regs regs[3]; +}; +static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); +static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); +static struct pt_regs *get_bpf_raw_tp_regs(void) +{ + struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); + int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); + + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { + this_cpu_dec(bpf_raw_tp_nest_level); + return ERR_PTR(-EBUSY); + } + + return &tp_regs->regs[nest_level - 1]; +} + +static void put_bpf_raw_tp_regs(void) +{ + this_cpu_dec(bpf_raw_tp_nest_level); +} + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return ____bpf_perf_event_output(regs, map, flags, data, size); + ret = ____bpf_perf_event_output(regs, map, flags, data, size); + + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { @@ -835,12 +972,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, struct bpf_map *, map, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ - return bpf_get_stackid((unsigned long) 
regs, (unsigned long) map, - flags, 0, 0); + ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { @@ -855,11 +998,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, void *, buf, u32, size, u64, flags) { - struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + struct pt_regs *regs = get_bpf_raw_tp_regs(); + int ret; + + if (IS_ERR(regs)) + return PTR_ERR(regs); perf_fetch_caller_regs(regs); - return bpf_get_stack((unsigned long) regs, (unsigned long) buf, - (unsigned long) size, flags, 0); + ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, + (unsigned long) size, flags, 0); + put_bpf_raw_tp_regs(); + return ret; } static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { @@ -910,6 +1059,27 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { const struct bpf_prog_ops raw_tracepoint_prog_ops = { }; +static bool raw_tp_writable_prog_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off == 0) { + if (size != sizeof(u64) || type != BPF_READ) + return false; + info->reg_type = PTR_TO_TP_BUFFER; + } + return raw_tp_prog_is_valid_access(off, size, type, prog, info); +} + +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_writable_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -1000,7 +1170,7 @@ static DEFINE_MUTEX(bpf_event_mutex); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret = -EEXIST; @@ -1018,7 +1188,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, if (event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); if (old_array && bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { ret = -E2BIG; @@ -1041,7 +1211,7 @@ unlock: void perf_event_detach_bpf_prog(struct perf_event *event) { - struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *old_array; struct bpf_prog_array *new_array; int ret; @@ -1050,7 +1220,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) if (!event->prog) goto unlock; - old_array = event->tp_event->prog_array; + old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); if (ret == -ENOENT) goto unlock; @@ -1072,6 +1242,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; + struct bpf_prog_array *progs; u32 *ids, prog_cnt, ids_len; int ret; @@ -1096,10 +1267,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) */ mutex_lock(&bpf_event_mutex); - ret = bpf_prog_array_copy_info(event->tp_event->prog_array, - ids, - ids_len, - &prog_cnt); + progs = bpf_event_rcu_dereference(event->tp_event->prog_array); + ret = bpf_prog_array_copy_info(progs, 
ids, ids_len, &prog_cnt); mutex_unlock(&bpf_event_mutex); if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || @@ -1199,6 +1368,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) return -EINVAL; + if (prog->aux->max_tp_access > btp->writable_size) + return -EINVAL; + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); } @@ -1259,8 +1431,23 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, return err; } +static int __init send_signal_irq_work_init(void) +{ + int cpu; + struct send_signal_irq_work *work; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&send_signal_work, cpu); + init_irq_work(&work->irq_work, do_bpf_send_signal); + } + return 0; +} + +subsys_initcall(send_signal_irq_work_init); + #ifdef CONFIG_MODULES -int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) +static int bpf_event_notify(struct notifier_block *nb, unsigned long op, + void *module) { struct bpf_trace_module *btm, *tmp; struct module *mod = module; @@ -1299,7 +1486,7 @@ static struct notifier_block bpf_module_nb = { .notifier_call = bpf_event_notify, }; -int __init bpf_event_init(void) +static int __init bpf_event_init(void) { register_module_notifier(&bpf_module_nb); return 0; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index aac7847c0214..576c41644e77 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <trace/events/sched.h> @@ -69,12 +70,8 @@ #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), -#define ASSIGN_OPS_HASH(opsname, val) \ - .func_hash = val, \ - .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), #else #define INIT_OPS_HASH(opsname) -#define ASSIGN_OPS_HASH(opsname, val) #endif enum { @@ -1992,7 +1989,7 @@ static void print_bug_type(void) * modifying the code. @failed should be one of either: * EFAULT - if the problem happens on reading the @ip address * EINVAL - if what is read at @ip is not what was expected - * EPERM - if the problem happens on writting to the @ip address + * EPERM - if the problem happens on writing to the @ip address */ void ftrace_bug(int failed, struct dyn_ftrace *rec) { @@ -2391,7 +2388,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } - return -1; /* unknow ftrace bug */ + return -1; /* unknown ftrace bug */ } void __weak ftrace_replace_code(int mod_flags) @@ -2938,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) p = &pg->records[i]; p->flags = rec_flags; -#ifndef CC_USING_NOP_MCOUNT /* * Do the initial record conversion from mcount jump * to the NOP instructions. 
*/ - if (!ftrace_code_disable(mod, p)) + if (!__is_defined(CC_USING_NOP_MCOUNT) && + !ftrace_code_disable(mod, p)) break; -#endif update_cnt++; } @@ -3004,7 +3000,7 @@ ftrace_allocate_pages(unsigned long num_to_init) int cnt; if (!num_to_init) - return 0; + return NULL; start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL); if (!pg) @@ -3702,6 +3698,31 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) } static int +add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, + int clear_filter) +{ + long index = simple_strtoul(func_g->search, NULL, 0); + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + /* The index starts at 1 */ + if (--index < 0) + return 0; + + do_for_each_ftrace_rec(pg, rec) { + if (pg->index <= index) { + index -= pg->index; + /* this is a double loop, break goes to the next page */ + break; + } + rec = &pg->records[index]; + enter_record(hash, rec, clear_filter); + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + +static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) { @@ -3769,6 +3790,11 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) if (unlikely(ftrace_disabled)) goto out_unlock; + if (func_g.type == MATCH_INDEX) { + found = add_rec_by_index(hash, &func_g, clear_filter); + goto out_unlock; + } + do_for_each_ftrace_rec(pg, rec) { if (rec->flags & FTRACE_FL_DISABLED) @@ -3849,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, static bool module_exists(const char *module) { /* All modules have the symbol __this_module */ - const char this_mod[] = "__this_module"; + static const char this_mod[] = "__this_module"; char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; unsigned long val; int n; @@ -4194,10 +4220,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, struct ftrace_func_entry *entry; struct ftrace_func_map *map; struct hlist_head *hhd; - int size = 1 << mapper->hash.size_bits; - int i; + int size, i; + + if (!mapper) + return; if (free_func && mapper->hash.count) { + size = 1 << mapper->hash.size_bits; for (i = 0; i < size; i++) { hhd = &mapper->hash.buckets[i]; hlist_for_each_entry(entry, hhd, hlist) { @@ -4725,7 +4754,7 @@ static int ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, int reset, int enable) { - return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); + return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } /** @@ -5433,7 +5462,7 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, /* * The name "destroy_filter_files" is really a misnomer. Although - * in the future, it may actualy delete the files, but this is + * in the future, it may actually delete the files, but this is * really intended to make sure the ops passed in are disabled * and that when this function returns, the caller is free to * free the ops. @@ -5756,7 +5785,7 @@ void ftrace_module_enable(struct module *mod) /* * If the tracing is enabled, go ahead and enable the record. * - * The reason not to enable the record immediatelly is the + * The reason not to enable the record immediately is the * inherent check of ftrace_make_nop/ftrace_make_call for * correct previous instructions. 
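The ftrace_update_code() hunk above folds an #ifndef CC_USING_NOP_MCOUNT block into a plain C condition via __is_defined(), so ftrace_code_disable() is always parsed and type-checked and the dead branch is dropped by the compiler. A standalone re-derivation of that preprocessor trick, loosely modelled on include/linux/kconfig.h; the macro names here are local, and like the kernel's version it only detects symbols defined to 1 (which is what a bare -DFOO gives you):

#include <stdio.h>

#define PLACEHOLDER_1 0,
#define take_second(ignored, val, ...) val
#define is_defined(x)         _is_defined(x)
#define _is_defined(val)      __is_def(PLACEHOLDER_##val)
#define __is_def(arg_or_junk) take_second(arg_or_junk 1, 0)

#define FEATURE_A 1             /* like CC_USING_NOP_MCOUNT being defined */
/* FEATURE_B deliberately left undefined */

int main(void)
{
        printf("FEATURE_A: %d\n", is_defined(FEATURE_A));   /* prints 1 */
        printf("FEATURE_B: %d\n", is_defined(FEATURE_B));   /* prints 0 */
        return 0;
}

When the symbol is defined to 1, the pasted PLACEHOLDER_1 expands to "0," and the second argument (1) is selected; when it is undefined, the paste produces junk in the first argument slot and the trailing 0 is selected, all at preprocessing time.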
Making first the NOP * conversion puts the module to the correct state, thus @@ -6216,7 +6245,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static inline void +static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { @@ -6234,6 +6263,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); do_for_each_ftrace_op(op, ftrace_ops_list) { + /* Stub functions don't need to be called nor tested */ + if (op->flags & FTRACE_OPS_FL_STUB) + continue; /* * Check the following for each ops before calling their func: * if RCU flag is set, then rcu_is_watching() must be true @@ -6276,11 +6308,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } +NOKPROBE_SYMBOL(ftrace_ops_list_func); #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif /* @@ -6307,6 +6341,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); trace_clear_recursion(bit); } +NOKPROBE_SYMBOL(ftrace_ops_assist_func); /** * ftrace_ops_get_func - get the function a trampoline should call diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 06e864a334bb..05b0b3139ebc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -353,20 +353,6 @@ static void rb_init_page(struct buffer_data_page *bpage) local_set(&bpage->commit, 0); } -/** - * ring_buffer_page_len - the size of data on the page. - * @page: The page to read - * - * Returns the amount of data on the page, including buffer page header. - */ -size_t ring_buffer_page_len(void *page) -{ - struct buffer_data_page *bpage = page; - - return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) - + BUF_PAGE_HDR_SIZE; -} - /* * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing * this issue out. @@ -776,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; } @@ -4205,6 +4191,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4222,7 +4209,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4230,7 +4217,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; @@ -4992,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) cnt = data->cnt + (nested ? 
27 : 0); /* Multiply cnt by ~e, to make some unique increment */ - size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); len = size + sizeof(struct rb_item); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -362,7 +362,7 @@ static void ring_buffer_producer(void) hit--; /* make it non zero */ } - /* Caculate the average time in nanosecs */ + /* Calculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); trace_printk("%ld ns per entry\n", avg); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..c90c687cf950 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ static int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -496,8 +498,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * not modified. */ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) + if (!pid_list) { + trace_parser_put(&parser); return -ENOMEM; + } pid_list->pid_max = READ_ONCE(pid_max); @@ -507,6 +511,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { + trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } @@ -894,7 +899,7 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -void tracing_snapshot_instance(struct trace_array *tr) +void tracing_snapshot_instance_cond(struct trace_array *tr, void *cond_data) { struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -920,10 +925,15 @@ void tracing_snapshot_instance(struct trace_array *tr) } local_irq_save(flags); - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), cond_data); local_irq_restore(flags); } +void tracing_snapshot_instance(struct trace_array *tr) +{ + tracing_snapshot_instance_cond(tr, NULL); +} + /** * tracing_snapshot - take a snapshot of the current buffer. * @@ -946,6 +956,54 @@ void tracing_snapshot(void) } EXPORT_SYMBOL_GPL(tracing_snapshot); +/** + * tracing_snapshot_cond - conditionally take a snapshot of the current buffer. + * @tr: The tracing instance to snapshot + * @cond_data: The data to be tested conditionally, and possibly saved + * + * This is the same as tracing_snapshot() except that the snapshot is + * conditional - the snapshot will only happen if the + * cond_snapshot.update() implementation receiving the cond_data + * returns true, which means that the trace array's cond_snapshot + * update() operation used the cond_data to determine whether the + * snapshot should be taken, and if it was, presumably saved it along + * with the snapshot. 
+ */ +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + tracing_snapshot_instance_cond(tr, cond_data); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); + +/** + * tracing_snapshot_cond_data - get the user data associated with a snapshot + * @tr: The tracing instance + * + * When the user enables a conditional snapshot using + * tracing_snapshot_cond_enable(), the user-defined cond_data is saved + * with the snapshot. This accessor is used to retrieve it. + * + * Should not be called from cond_snapshot.update(), since it takes + * the tr->max_lock lock, which the code calling + * cond_snapshot.update() has already done. + * + * Returns the cond_data associated with the trace array's snapshot. + */ +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + void *cond_data = NULL; + + arch_spin_lock(&tr->max_lock); + + if (tr->cond_snapshot) + cond_data = tr->cond_snapshot->cond_data; + + arch_spin_unlock(&tr->max_lock); + + return cond_data; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); + static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); @@ -1025,12 +1083,111 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); + +/** + * tracing_snapshot_cond_enable - enable conditional snapshot for an instance + * @tr: The tracing instance + * @cond_data: User data to associate with the snapshot + * @update: Implementation of the cond_snapshot update function + * + * Check whether the conditional snapshot for the given instance has + * already been enabled, or if the current tracer is already using a + * snapshot; if so, return -EBUSY, else create a cond_snapshot and + * save the cond_data and update function inside. + * + * Returns 0 if successful, error otherwise. + */ +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, + cond_update_fn_t update) +{ + struct cond_snapshot *cond_snapshot; + int ret = 0; + + cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL); + if (!cond_snapshot) + return -ENOMEM; + + cond_snapshot->cond_data = cond_data; + cond_snapshot->update = update; + + mutex_lock(&trace_types_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) + goto fail_unlock; + + if (tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto fail_unlock; + } + + /* + * The cond_snapshot can only change to NULL without the + * trace_types_lock. We don't care if we race with it going + * to NULL, but we want to make sure that it's not set to + * something other than NULL when we get here, which we can + * do safely with only holding the trace_types_lock and not + * having to take the max_lock. + */ + if (tr->cond_snapshot) { + ret = -EBUSY; + goto fail_unlock; + } + + arch_spin_lock(&tr->max_lock); + tr->cond_snapshot = cond_snapshot; + arch_spin_unlock(&tr->max_lock); + + mutex_unlock(&trace_types_lock); + + return ret; + + fail_unlock: + mutex_unlock(&trace_types_lock); + kfree(cond_snapshot); + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); + +/** + * tracing_snapshot_cond_disable - disable conditional snapshot for an instance + * @tr: The tracing instance + * + * Check whether the conditional snapshot for the given instance is + * enabled; if so, free the cond_snapshot associated with it, + * otherwise return -EINVAL. + * + * Returns 0 if successful, error otherwise. 
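Taken together, tracing_snapshot_cond_enable(), tracing_snapshot_cond() and tracing_snapshot_cond_disable() form a small in-kernel API (the hist-trigger snapshot action added elsewhere in this series is the intended user). A hedged sketch of how a caller might wire it up; the my_* names, the 2 ms threshold and the assumption about which header exposes the prototypes are all illustrative:

#include <linux/kernel.h>   /* assumption: the tracing_snapshot_cond*() prototypes are visible here */

/* Called with the cond_data handed to tracing_snapshot_cond() below;
 * returning false vetoes the buffer swap in update_max_tr(). */
static bool my_update(struct trace_array *tr, void *cond_data)
{
        u64 *latency_ns = cond_data;

        return *latency_ns > 2000000;   /* only keep snapshots of latencies > 2 ms */
}

static int my_arm(struct trace_array *tr)
{
        /* Enable-time cond_data (NULL here) is what
         * tracing_cond_snapshot_data(tr) will later return. */
        return tracing_snapshot_cond_enable(tr, NULL, my_update);
}

static void my_hot_path(struct trace_array *tr, u64 latency_ns)
{
        /* Snapshot-time cond_data is what my_update() sees. */
        tracing_snapshot_cond(tr, &latency_ns);
}

static void my_disarm(struct trace_array *tr)
{
        tracing_snapshot_cond_disable(tr);
}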
+ */ +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + int ret = 0; + + arch_spin_lock(&tr->max_lock); + + if (!tr->cond_snapshot) + ret = -EINVAL; + else { + kfree(tr->cond_snapshot); + tr->cond_snapshot = NULL; + } + + arch_spin_unlock(&tr->max_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_cond(struct trace_array *tr, void *cond_data) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal conditional snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond); int tracing_alloc_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but snapshot allocation used"); @@ -1043,6 +1200,21 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +void *tracing_cond_snapshot_data(struct trace_array *tr) +{ + return NULL; +} +EXPORT_SYMBOL_GPL(tracing_cond_snapshot_data); +int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update) +{ + return -ENODEV; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable); +int tracing_snapshot_cond_disable(struct trace_array *tr) +{ + return false; +} +EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -1330,7 +1502,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + strncpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; /* * If tsk == current, then use current_uid(), as that does not use @@ -1354,12 +1526,14 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) * @tr: tracer * @tsk: the task with the latency * @cpu: The cpu that initiated the trace. + * @cond_data: User data associated with a conditional snapshot * * Flip the buffers between the @tr and the max_tr and record information * about which task was the cause of this latency. */ void -update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data) { if (tr->stop_count) return; @@ -1380,9 +1554,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) else ring_buffer_record_off(tr->max_buffer.buffer); +#ifdef CONFIG_TRACER_SNAPSHOT + if (tr->cond_snapshot && !tr->cond_snapshot->update(tr, cond_data)) + goto out_unlock; +#endif swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); + + out_unlock: arch_spin_unlock(&tr->max_lock); } @@ -1547,6 +1727,10 @@ static __init int init_trace_selftests(void) pr_info("Running postponed tracer tests:\n"); list_for_each_entry_safe(p, n, &postponed_selftests, list) { + /* This loop can take minutes when sanitizers are enabled, so + * lets make sure we allow RCU processing. 
+ */ + cond_resched(); ret = run_tracer_selftest(p->type); /* If the test fails, then warn and remove from available_tracers */ if (ret < 0) { @@ -1748,7 +1932,7 @@ static inline char *get_saved_cmdlines(int idx) static inline void set_cmdline(int idx, const char *cmdline) { - memcpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); } static int allocate_cmdlines_buffer(unsigned int val, @@ -2574,12 +2758,21 @@ trace_function(struct trace_array *tr, #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING 4 + +#define FTRACE_KSTACK_ENTRIES (PAGE_SIZE / FTRACE_KSTACK_NESTING) + struct ftrace_stack { - unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; + unsigned long calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { + struct ftrace_stack stacks[FTRACE_KSTACK_NESTING]; }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks); static DEFINE_PER_CPU(int, ftrace_stack_reserve); static void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -2588,13 +2781,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, { struct trace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; + unsigned int size, nr_entries; + struct ftrace_stack *fstack; struct stack_entry *entry; - struct stack_trace trace; - int use_stack; - int size = FTRACE_STACK_ENTRIES; - - trace.nr_entries = 0; - trace.skip = skip; + int stackidx; /* * Add one, for this function and the call to save_stack_trace() @@ -2602,7 +2792,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ #ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip++; + skip++; #endif /* @@ -2613,53 +2803,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, */ preempt_disable_notrace(); - use_stack = __this_cpu_inc_return(ftrace_stack_reserve); + stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + + /* This should never happen. If it does, yell once and skip */ + if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + goto out; + /* - * We don't need any atomic variables, just a barrier. - * If an interrupt comes in, we don't care, because it would - * have exited and put the counter back to what we want. - * We just need a barrier to keep gcc from moving things - * around. + * The above __this_cpu_inc_return() is 'atomic' cpu local. An + * interrupt will either see the value pre increment or post + * increment. If the interrupt happens pre increment it will have + * restored the counter when it returns. We just need a barrier to + * keep gcc from moving things around. 
*/ barrier(); - if (use_stack == 1) { - trace.entries = this_cpu_ptr(ftrace_stack.calls); - trace.max_entries = FTRACE_STACK_MAX_ENTRIES; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - - if (trace.nr_entries > size) - size = trace.nr_entries; - } else - /* From now on, use_stack is a boolean */ - use_stack = 0; + fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; + size = ARRAY_SIZE(fstack->calls); - size *= sizeof(unsigned long); + if (regs) { + nr_entries = stack_trace_save_regs(regs, fstack->calls, + size, skip); + } else { + nr_entries = stack_trace_save(fstack->calls, size, skip); + } + size = nr_entries * sizeof(unsigned long); event = __trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size, flags, pc); if (!event) goto out; entry = ring_buffer_event_data(event); - memset(&entry->caller, 0, size); - - if (use_stack) - memcpy(&entry->caller, trace.entries, - trace.nr_entries * sizeof(unsigned long)); - else { - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.entries = entry->caller; - if (regs) - save_stack_trace_regs(regs, &trace); - else - save_stack_trace(&trace); - } - - entry->size = trace.nr_entries; + memcpy(&entry->caller, fstack->calls, size); + entry->size = nr_entries; if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -2729,15 +2906,15 @@ void trace_dump_stack(int skip) } EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT static DEFINE_PER_CPU(int, user_stack_count); -void +static void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - struct stack_trace trace; if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) return; @@ -2768,12 +2945,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) entry->tgid = current->tgid; memset(&entry->caller, 0, sizeof(entry->caller)); - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = 0; - trace.entries = entry->caller; - - save_stack_trace_user(&trace); + stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES); if (!call_filter_check_discard(call, entry, buffer, event)) __buffer_unlock_commit(buffer, event); @@ -2782,13 +2954,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) out: preempt_enable(); } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) { - ftrace_trace_userstack(tr, flags, preempt_count()); } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */ #endif /* CONFIG_STACKTRACE */ @@ -2878,6 +3049,7 @@ void trace_printk_init_buffers(void) if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } +EXPORT_SYMBOL_GPL(trace_printk_init_buffers); void trace_printk_start_comm(void) { @@ -3038,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr, va_end(ap); return ret; } +EXPORT_SYMBOL_GPL(trace_array_printk); __printf(3, 4) int trace_array_printk_buf(struct ring_buffer *buffer, @@ -3316,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p) } static void +get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, + unsigned long *entries, int cpu) +{ + unsigned long count; + + count = 
ring_buffer_entries_cpu(buf->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; + /* total is the same as the entries */ + *total = count; + } else + *total = count + + ring_buffer_overrun_cpu(buf->buffer, cpu); + *entries = count; +} + +static void get_total_entries(struct trace_buffer *buf, unsigned long *total, unsigned long *entries) { - unsigned long count; + unsigned long t, e; int cpu; *total = 0; *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(buf->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; - /* total is the same as the entries */ - *total += count; - } else - *total += count + - ring_buffer_overrun_cpu(buf->buffer, cpu); - *entries += count; + get_total_entries_cpu(buf, &t, &e, cpu); + *total += t; + *entries += e; } } +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); + + return entries; +} + +unsigned long trace_total_entries(struct trace_array *tr) +{ + unsigned long total, entries; + + if (!tr) + tr = &global_trace; + + get_total_entries(&tr->trace_buffer, &total, &entries); + + return entries; +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" @@ -3381,23 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file unsigned int flags) { bool tgid = flags & TRACE_ITER_RECORD_TGID; - const char tgid_space[] = " "; - const char space[] = " "; - - seq_printf(m, "# %s _-----=> irqs-off\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s / _----=> need-resched\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s| / _---=> hardirq/softirq\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s|| / _--=> preempt-depth\n", - tgid ? tgid_space : space); - seq_printf(m, "# %s||| / delay\n", - tgid ? tgid_space : space); - seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", - tgid ? " TGID " : space); - seq_printf(m, "# | | %s | |||| | |\n", - tgid ? " | " : space); + const char *space = " "; + int prec = tgid ? 
10 : 2; + + print_event_info(buf, m); + + seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); } void @@ -3902,7 +4105,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -3912,7 +4116,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -4521,6 +4726,7 @@ static const char readme_msg[] = " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" " current_tracer\t- function and latency tracers\n" " available_tracers\t- list of configured tracers for current_tracer\n" + " error_log\t- error log for failed commands (that support it)\n" " buffer_size_kb\t- view and modify size of per cpu buffer\n" " buffer_total_size_kb - view total size of all cpu buffers\n\n" " trace_clock\t\t-change the clock used to order events\n" @@ -4541,7 +4747,7 @@ static const char readme_msg[] = " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" "\t\t\t Remove sub-buffer with rmdir\n" " trace_options\t\t- Set format or modify how tracing happens\n" - "\t\t\t Disable an option by adding a suffix 'no' to the\n" + "\t\t\t Disable an option by prefixing 'no' to the\n" "\t\t\t option name\n" " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" #ifdef CONFIG_DYNAMIC_FTRACE @@ -4700,6 +4906,7 @@ static const char readme_msg[] = "\t [:size=#entries]\n" "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" + "\t [:<handler>.<action>]\n" "\t [if <filter>]\n\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" @@ -4740,8 +4947,21 @@ static const char readme_msg[] = "\t unchanged.\n\n" "\t The enable_hist and disable_hist triggers can be used to\n" "\t have one event conditionally start and stop another event's\n" - "\t already-attached hist trigger. The syntax is analagous to\n" - "\t the enable_event and disable_event triggers.\n" + "\t already-attached hist trigger. The syntax is analogous to\n" + "\t the enable_event and disable_event triggers.\n\n" + "\t Hist trigger handlers and actions are executed whenever a\n" + "\t a histogram entry is added or updated. They take the form:\n\n" + "\t <handler>.<action>\n\n" + "\t The available handlers are:\n\n" + "\t onmatch(matching.event) - invoke on addition or update\n" + "\t onmax(var) - invoke if var exceeds current max\n" + "\t onchange(var) - invoke action if var changes\n\n" + "\t The available actions are:\n\n" + "\t trace(<synthetic_event>,param list) - generate synthetic event\n" + "\t save(field,...) 
- save current event fields\n" +#ifdef CONFIG_TRACER_SNAPSHOT + "\t snapshot() - snapshot the trace buffer\n" +#endif #endif ; @@ -5386,6 +5606,16 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t == tr->current_trace) goto out; +#ifdef CONFIG_TRACER_SNAPSHOT + if (t->use_max_tr) { + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + } +#endif /* Some tracers won't work on kernel command line */ if (system_state < SYSTEM_RUNNING && t->noboot) { pr_warn("Tracer '%s' is not allowed on command line, ignored\n", @@ -5624,7 +5854,6 @@ out: return ret; fail: - kfree(iter->trace); kfree(iter); __trace_array_put(tr); mutex_unlock(&trace_types_lock); @@ -5823,7 +6052,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, } static const struct pipe_buf_operations tracing_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -6103,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, struct ring_buffer *buffer; struct print_entry *entry; unsigned long irq_flags; - const char faulted[] = "<faulted>"; ssize_t written; int size; int len; /* Used in tracing_mark_raw_write() as well */ -#define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ +#define FAULTED_STR "<faulted>" +#define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ if (tracing_disabled) return -EINVAL; @@ -6141,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); if (len) { - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; written = -EFAULT; } else @@ -6182,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, struct ring_buffer_event *event; struct ring_buffer *buffer; struct raw_data_entry *entry; - const char faulted[] = "<faulted>"; unsigned long irq_flags; ssize_t written; int size; @@ -6222,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); if (len) { entry->id = -1; - memcpy(&entry->buf, faulted, FAULTED_SIZE); + memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); written = -EFAULT; } else written = cnt; @@ -6468,6 +6695,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + arch_spin_lock(&tr->max_lock); + if (tr->cond_snapshot) + ret = -EBUSY; + arch_spin_unlock(&tr->max_lock); + if (ret) + goto out; + switch (val) { case 0: if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { @@ -6485,15 +6719,17 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; } #endif - if (!tr->allocated_snapshot) { + if (tr->allocated_snapshot) + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, iter->cpu_file); + else ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) - break; - } + if (ret < 0) + break; local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - update_max_tr(tr, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id(), NULL); else update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); @@ -6668,6 +6904,250 @@ static const struct file_operations snapshot_raw_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ +#define 
TRACING_LOG_ERRS_MAX 8 +#define TRACING_LOG_LOC_MAX 128 + +#define CMD_PREFIX " Command: " + +struct err_info { + const char **errs; /* ptr to loc-specific array of err strings */ + u8 type; /* index into errs -> specific err string */ + u8 pos; /* MAX_FILTER_STR_VAL = 256 */ + u64 ts; +}; + +struct tracing_log_err { + struct list_head list; + struct err_info info; + char loc[TRACING_LOG_LOC_MAX]; /* err location */ + char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ +}; + +static DEFINE_MUTEX(tracing_err_log_lock); + +static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) +{ + struct tracing_log_err *err; + + if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { + err = kzalloc(sizeof(*err), GFP_KERNEL); + if (!err) + err = ERR_PTR(-ENOMEM); + tr->n_err_log_entries++; + + return err; + } + + err = list_first_entry(&tr->err_log, struct tracing_log_err, list); + list_del(&err->list); + + return err; +} + +/** + * err_pos - find the position of a string within a command for error careting + * @cmd: The tracing command that caused the error + * @str: The string to position the caret at within @cmd + * + * Finds the position of the first occurence of @str within @cmd. The + * return value can be passed to tracing_log_err() for caret placement + * within @cmd. + * + * Returns the index within @cmd of the first occurence of @str or 0 + * if @str was not found. + */ +unsigned int err_pos(char *cmd, const char *str) +{ + char *found; + + if (WARN_ON(!strlen(cmd))) + return 0; + + found = strstr(cmd, str); + if (found) + return found - cmd; + + return 0; +} + +/** + * tracing_log_err - write an error to the tracing error log + * @tr: The associated trace array for the error (NULL for top level array) + * @loc: A string describing where the error occurred + * @cmd: The tracing command that caused the error + * @errs: The array of loc-specific static error strings + * @type: The index into errs[], which produces the specific static err string + * @pos: The position the caret should be placed in the cmd + * + * Writes an error into tracing/error_log of the form: + * + * <loc>: error: <text> + * Command: <cmd> + * ^ + * + * tracing/error_log is a small log file containing the last + * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated + * unless there has been a tracing error, and the error log can be + * cleared and have its memory freed by writing the empty string in + * truncation mode to it i.e. echo > tracing/error_log. + * + * NOTE: the @errs array along with the @type param are used to + * produce a static error string - this string is not copied and saved + * when the error is logged - only a pointer to it is saved. See + * existing callers for examples of how static strings are typically + * defined for use with tracing_log_err(). 
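Since the comment above defers to "existing callers" for usage, here is a hedged sketch of what such a caller could look like: a static table of error strings kept in sync with an enum via a C() macro (one common way to do it), an invented my_cmd_err() wrapper, and err_pos() locating the offending token for the caret. The MY_* names are made up; tracing_log_err() and err_pos() are the functions added above.

/* assumption: built inside kernel/trace/, where "trace.h" declares
 * tracing_log_err() and err_pos() */
#include "trace.h"

#define MY_CMD_ERRS                             \
        C(BAD_KEY,      "Unknown key field"),   \
        C(BAD_ACTION,   "Unknown action"),

#undef C
#define C(a, b)         MY_ERR_##a
enum { MY_CMD_ERRS };

#undef C
#define C(a, b)         b
static const char *my_cmd_err_text[] = { MY_CMD_ERRS };

static void my_cmd_err(struct trace_array *tr, char *cmd,
                       const char *bad_token, u8 err_type)
{
        /* The error text is not copied, so the table must be static. */
        tracing_log_err(tr, "my_cmd", cmd, my_cmd_err_text,
                        err_type, err_pos(cmd, bad_token));
}

/* e.g. my_cmd_err(tr, "keys=bogus:onmax(x)", "bogus", MY_ERR_BAD_KEY); */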
+ */ +void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos) +{ + struct tracing_log_err *err; + + if (!tr) + tr = &global_trace; + + mutex_lock(&tracing_err_log_lock); + err = get_tracing_log_err(tr); + if (PTR_ERR(err) == -ENOMEM) { + mutex_unlock(&tracing_err_log_lock); + return; + } + + snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); + snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); + + err->info.errs = errs; + err->info.type = type; + err->info.pos = pos; + err->info.ts = local_clock(); + + list_add_tail(&err->list, &tr->err_log); + mutex_unlock(&tracing_err_log_lock); +} + +static void clear_tracing_err_log(struct trace_array *tr) +{ + struct tracing_log_err *err, *next; + + mutex_lock(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { + list_del(&err->list); + kfree(err); + } + + tr->n_err_log_entries = 0; + mutex_unlock(&tracing_err_log_lock); +} + +static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) +{ + struct trace_array *tr = m->private; + + mutex_lock(&tracing_err_log_lock); + + return seq_list_start(&tr->err_log, *pos); +} + +static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + + return seq_list_next(v, &tr->err_log, pos); +} + +static void tracing_err_log_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&tracing_err_log_lock); +} + +static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) +{ + u8 i; + + for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) + seq_putc(m, ' '); + for (i = 0; i < pos; i++) + seq_putc(m, ' '); + seq_puts(m, "^\n"); +} + +static int tracing_err_log_seq_show(struct seq_file *m, void *v) +{ + struct tracing_log_err *err = v; + + if (err) { + const char *err_text = err->info.errs[err->info.type]; + u64 sec = err->info.ts; + u32 nsec; + + nsec = do_div(sec, NSEC_PER_SEC); + seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, + err->loc, err_text); + seq_printf(m, "%s", err->cmd); + tracing_err_log_show_pos(m, err->info.pos); + } + + return 0; +} + +static const struct seq_operations tracing_err_log_seq_ops = { + .start = tracing_err_log_seq_start, + .next = tracing_err_log_seq_next, + .stop = tracing_err_log_seq_stop, + .show = tracing_err_log_seq_show +}; + +static int tracing_err_log_open(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + int ret = 0; + + if (trace_array_get(tr) < 0) + return -ENODEV; + + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) + clear_tracing_err_log(tr); + + if (file->f_mode & FMODE_READ) { + ret = seq_open(file, &tracing_err_log_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = tr; + } else { + trace_array_put(tr); + } + } + return ret; +} + +static ssize_t tracing_err_log_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int tracing_err_log_release(struct inode *inode, struct file *file) +{ + struct trace_array *tr = inode->i_private; + + trace_array_put(tr); + + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + +static const struct file_operations tracing_err_log_fops = { + .open = tracing_err_log_open, + .write = tracing_err_log_write, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_err_log_release, +}; + static int 
tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -6817,36 +7297,43 @@ struct buffer_ref { struct ring_buffer *buffer; void *page; int cpu; - int ref; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + if (refcount_read(&ref->refcount) > INT_MAX/2) + return false; + + refcount_inc(&ref->refcount); + return true; } /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -6859,11 +7346,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -6918,7 +7401,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - ref->ref = 1; + refcount_set(&ref->refcount, 1); ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { @@ -7723,7 +8206,7 @@ static const struct file_operations buffer_percent_fops = { .llseek = default_llseek, }; -struct dentry *trace_instance_dir; +static struct dentry *trace_instance_dir; static void init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); @@ -7830,7 +8313,7 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -static int instance_mkdir(const char *name) +struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; @@ -7869,6 +8352,7 @@ static int instance_mkdir(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); INIT_LIST_HEAD(&tr->hist_vars); + INIT_LIST_HEAD(&tr->err_log); if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; @@ -7894,7 +8378,7 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return 0; + return tr; out_free_tr: free_trace_buffers(tr); @@ -7906,33 +8390,21 @@ static int instance_mkdir(const char *name) mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); - return ret; + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(trace_array_create); +static int instance_mkdir(const char *name) +{ + return PTR_ERR_OR_ZERO(trace_array_create(name)); } -static int instance_rmdir(const char *name) +static int __remove_instance(struct trace_array *tr) { - struct trace_array *tr; - int found = 0; - int ret; int i; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -ENODEV; - list_for_each_entry(tr, 
&ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) { - found = 1; - break; - } - } - if (!found) - goto out_unlock; - - ret = -EBUSY; if (tr->ref || (tr->current_trace && tr->current_trace->ref)) - goto out_unlock; + return -EBUSY; list_del(&tr->list); @@ -7958,10 +8430,46 @@ static int instance_rmdir(const char *name) free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); + tr = NULL; - ret = 0; + return 0; +} + +int trace_array_destroy(struct trace_array *tr) +{ + int ret; + + if (!tr) + return -EINVAL; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = __remove_instance(tr); + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_array_destroy); + +static int instance_rmdir(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + ret = __remove_instance(tr); + break; + } + } - out_unlock: mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); @@ -8051,6 +8559,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) tr, &snapshot_fops); #endif + trace_create_file("error_log", 0644, d_tracer, + tr, &tracing_err_log_fops); + for_each_tracing_cpu(cpu) tracing_init_tracefs_percpu(tr, cpu); @@ -8107,10 +8618,6 @@ struct dentry *tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); - if (!tr->dir) { - pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return ERR_PTR(-ENOMEM); - } return NULL; } @@ -8413,12 +8920,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) cnt++; - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter); iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (trace_find_next_entry_inc(&iter) != NULL) { int ret; @@ -8636,6 +9139,7 @@ __init static int tracer_alloc_buffers(void) INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); INIT_LIST_HEAD(&global_trace.hist_vars); + INIT_LIST_HEAD(&global_trace.err_log); list_add(&global_trace.list, &ftrace_trace_arrays); apply_trace_boot_options(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 08900828d282..005f08629b8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -15,7 +15,6 @@ #include <linux/trace_seq.h> #include <linux/trace_events.h> #include <linux/compiler.h> -#include <linux/trace_seq.h> #include <linux/glob.h> #ifdef CONFIG_FTRACE_SYSCALLS @@ -194,6 +193,51 @@ struct trace_pid_list { unsigned long *pids; }; +typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); + +/** + * struct cond_snapshot - conditional snapshot data and callback + * + * The cond_snapshot structure encapsulates a callback function and + * data associated with the snapshot for a given tracing instance. + * + * When a snapshot is taken conditionally, by invoking + * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is + * passed in turn to the cond_snapshot.update() function. That data + * can be compared by the update() implementation with the cond_data + * contained within the struct cond_snapshot instance associated with + * the trace_array.
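A minimal sketch of the flow this comment describes, using hypothetical names (struct my_cond, my_update(), my_cond_data; tr and latency are assumed to come from the caller's context): the update() callback compares the cond_data registered at enable time, retrievable with tracing_cond_snapshot_data(), against the cond_data handed to tracing_snapshot_cond() at the trigger site, and returns true only when a snapshot should actually be taken.

struct my_cond {
	u64 max_seen;
};
static struct my_cond my_cond_data;

static bool my_update(struct trace_array *tr, void *cond_data)
{
	struct my_cond *reg = tracing_cond_snapshot_data(tr);	/* enable-time data */
	u64 *cur = cond_data;		/* value passed to tracing_snapshot_cond() */

	if (*cur <= reg->max_seen)
		return false;		/* not a new maximum: skip the snapshot */

	reg->max_seen = *cur;		/* safe to update: tr->max_lock is held here */
	return true;
}

/* registration, e.g. when the trigger is set up */
ret = tracing_snapshot_cond_enable(tr, &my_cond_data, my_update);

/* at the point of interest */
tracing_snapshot_cond(tr, &latency);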
Because the tr->max_lock is held throughout the + update() call, the update() function can directly retrieve the + cond_snapshot and cond_data associated with the per-instance + snapshot associated with the trace_array. + * + * The cond_snapshot.update() implementation can save data to be + * associated with the snapshot if it decides to, and returns 'true' + * in that case, or it returns 'false' if the conditional snapshot + * shouldn't be taken. + * + * The cond_snapshot instance is created and associated with the + * user-defined cond_data by tracing_snapshot_cond_enable(). + * Likewise, the cond_snapshot instance is destroyed and is no longer + * associated with the trace instance by + * tracing_snapshot_cond_disable(). + * + * The method below is required. + * + * @update: When a conditional snapshot is invoked, the update() + * callback function is invoked with the tr->max_lock held. The + * update() implementation signals whether or not to actually + * take the snapshot, by returning 'true' if so, 'false' if no + * snapshot should be taken. Because the max_lock is held for + * the duration of update(), the implementation is safe to + * directly retrieve and save any implementation data it needs + * to in association with the snapshot. + */ +struct cond_snapshot { + void *cond_data; + cond_update_fn_t update; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -248,11 +292,13 @@ struct trace_array { int nr_topts; bool clear_trace; int buffer_percent; + unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; + struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; @@ -277,6 +323,9 @@ struct trace_array { #endif int time_stamp_abs_ref; struct list_head hist_vars; +#ifdef CONFIG_TRACER_SNAPSHOT + struct cond_snapshot *cond_snapshot; +#endif }; enum { @@ -671,6 +720,9 @@ void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); +unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); +unsigned long trace_total_entries(struct trace_array *tr); + void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, @@ -727,23 +779,16 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE -void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, + void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ #ifdef CONFIG_STACKTRACE -void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, - int pc); - void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_userstack(struct ring_buffer *buffer, - unsigned long flags, int pc) -{ -} - static inline void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { @@ -855,10 +900,11 @@ static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 -#define TRACE_GRAPH_PRINT_IRQS 0x40 -#define
TRACE_GRAPH_PRINT_TAIL 0x80 -#define TRACE_GRAPH_SLEEP_TIME 0x100 -#define TRACE_GRAPH_GRAPH_TIME 0x200 +#define TRACE_GRAPH_PRINT_REL_TIME 0x40 +#define TRACE_GRAPH_PRINT_IRQS 0x80 +#define TRACE_GRAPH_PRINT_TAIL 0x100 +#define TRACE_GRAPH_SLEEP_TIME 0x200 +#define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) @@ -1458,6 +1504,7 @@ enum regex_type { MATCH_MIDDLE_ONLY, MATCH_END_ONLY, MATCH_GLOB, + MATCH_INDEX, }; struct regex { @@ -1502,7 +1549,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -extern int create_event_filter(struct trace_event_call *call, +extern int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); @@ -1808,6 +1856,11 @@ static inline bool event_command_needs_rec(struct event_command *cmd_ops) extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); +extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); +extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); + +extern int tracing_snapshot_cond_disable(struct trace_array *tr); +extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; @@ -1828,6 +1881,11 @@ extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char**)); +extern unsigned int err_pos(char *cmd, const char *str); +extern void tracing_log_err(struct trace_array *tr, + const char *loc, const char *cmd, + const char **errs, u8 type, u8 pos); + /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats @@ -1908,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } extern struct trace_iterator *tracepoint_print_iter; +/* + * Reset the state of the trace_iterator so that it can read consumed data. + * Normally, the trace_iterator is used for reading the data when it is not + * consumed, and must retain state. + */ +static __always_inline void trace_iterator_reset(struct trace_iterator *iter) +{ + const size_t offset = offsetof(struct trace_iterator, seq); + + /* + * Keep gcc from complaining about overwriting more than just one + * member in the structure. 
+ */ + memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); + + iter->pos = -1; +} + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect) void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant) { + unsigned long flags = user_access_save(); + /* A constant is always correct */ if (is_constant) { f->constant++; @@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, f->data.correct++; else f->data.incorrect++; + + user_access_restore(flags); } EXPORT_SYMBOL(ftrace_likely_update); diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index dd1f43588d70..fa100ed3b4de 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -74,7 +74,7 @@ int dyn_event_release(int argc, char **argv, struct dyn_event_operations *type) static int create_dyn_event(int argc, char **argv) { struct dyn_event_operations *ops; - int ret; + int ret = -ENODEV; if (argv[0][0] == '-' || argv[0][0] == '!') return dyn_event_release(argc, argv, NULL); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 06bb2fd9a56c..fc8e97328e54 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -65,7 +65,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + F_printk(" %ps <-- %ps", + (void *)__entry->ip, (void *)__entry->parent_ip), FILTER_TRACE_FN, @@ -83,7 +84,7 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth), + F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth), FILTER_OTHER ); @@ -102,8 +103,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, __field_desc( int, ret, depth ) ), - F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", - __entry->func, __entry->depth, + F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", + (void *)__entry->func, __entry->depth, __entry->calltime, __entry->rettime, __entry->depth), @@ -167,12 +168,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 -#ifndef CONFIG_64BIT -# define IP_FMT "%08lx" -#else -# define IP_FMT "%016lx" -#endif - FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -182,12 +177,13 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); @@ -201,12 +197,13 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - 
F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" - "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", - __entry->caller[0], __entry->caller[1], __entry->caller[2], - __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]), + F_printk("\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n\t=> %ps\n" + "\t=> %ps\n\t=> %ps\n", + (void *)__entry->caller[0], (void *)__entry->caller[1], + (void *)__entry->caller[2], (void *)__entry->caller[3], + (void *)__entry->caller[4], (void *)__entry->caller[5], + (void *)__entry->caller[6], (void *)__entry->caller[7]), FILTER_OTHER ); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 76217bbef815..4629a6104474 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -299,15 +299,13 @@ int perf_uprobe_init(struct perf_event *p_event, if (!p_event->attr.uprobe_path) return -EINVAL; - path = kzalloc(PATH_MAX, GFP_KERNEL); - if (!path) - return -ENOMEM; - ret = strncpy_from_user( - path, u64_to_user_ptr(p_event->attr.uprobe_path), PATH_MAX); - if (ret == PATH_MAX) - return -E2BIG; - if (ret < 0) - goto out; + + path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path), + PATH_MAX); + if (IS_ERR(path)) { + ret = PTR_ERR(path); + return (ret == -EINVAL) ? -E2BIG : ret; + } if (path[0] == '\0') { ret = -EINVAL; goto out; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } +EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) char buf[32]; int len; - if (*ppos) - return 0; - if (unlikely(!id)) return -ENODEV; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 27821480105e..5079d1db3754 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -66,7 +66,8 @@ static const char * ops[] = { OPS }; C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ - C(NO_FILTER, "No filter found"), + C(ERRNO, "Error"), \ + C(NO_FILTER, "No filter found") #undef C #define C(a, b) FILT_ERR_##a @@ -76,7 +77,7 @@ enum { ERRORS }; #undef C #define C(a, b) b -static char *err_text[] = { ERRORS }; +static const char *err_text[] = { ERRORS }; /* Called after a '!' 
character but "!=" and "!~" are not "not"s */ static bool is_not(const char *str) @@ -427,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); if (!op_stack) return ERR_PTR(-ENOMEM); - prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); + prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); if (!prog_stack) { parse_error(pe, -ENOMEM, 0); goto out_free; @@ -491,10 +492,12 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, break; case '&': case '|': + /* accepting only "&&" or "||" */ if (next[1] == next[0]) { ptr++; break; } + /* fall through */ default: parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str); @@ -576,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, out_free: kfree(op_stack); kfree(inverts); - kfree(prog_stack); + if (prog_stack) { + for (i = 0; prog_stack[i].pred; i++) + kfree(prog_stack[i].pred); + kfree(prog_stack); + } return ERR_PTR(ret); } @@ -823,6 +830,9 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) *search = buff; + if (isdigit(buff[0])) + return MATCH_INDEX; + for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { @@ -860,6 +870,8 @@ static void filter_build_regex(struct filter_pred *pred) } switch (type) { + /* MATCH_INDEX should not happen, but if it does, match full */ + case MATCH_INDEX: case MATCH_FULL: r->match = regex_match_full; break; @@ -912,7 +924,8 @@ static void remove_filter_string(struct event_filter *filter) filter->filter_string = NULL; } -static void append_filter_err(struct filter_parse_error *pe, +static void append_filter_err(struct trace_array *tr, + struct filter_parse_error *pe, struct event_filter *filter) { struct trace_seq *s; @@ -940,8 +953,14 @@ static void append_filter_err(struct filter_parse_error *pe, if (pe->lasterr > 0) { trace_seq_printf(s, "\n%*s", pos, "^"); trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); + tracing_log_err(tr, "event filter parse error", + filter->filter_string, err_text, + pe->lasterr, pe->lasterr_pos); } else { trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); + tracing_log_err(tr, "event filter parse error", + filter->filter_string, err_text, + FILT_ERR_ERRNO, 0); } trace_seq_putc(s, 0); buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); @@ -1207,30 +1226,30 @@ static int parse_pred(const char *str, void *data, * (perf doesn't use it) and grab everything. */ if (strcmp(field->name, "ip") != 0) { - parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); - goto err_free; - } - pred->fn = filter_pred_none; - - /* - * Quotes are not required, but if they exist then we need - * to read them till we hit a matching one. - */ - if (str[i] == '\'' || str[i] == '"') - q = str[i]; - else - q = 0; - - for (i++; str[i]; i++) { - if (q && str[i] == q) - break; - if (!q && (str[i] == ')' || str[i] == '&' || - str[i] == '|')) - break; - } - /* Skip quotes */ - if (q) - s++; + parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); + goto err_free; + } + pred->fn = filter_pred_none; + + /* + * Quotes are not required, but if they exist then we need + * to read them till we hit a matching one. 
+ */ + if (str[i] == '\'' || str[i] == '"') + q = str[i]; + else + q = 0; + + for (i++; str[i]; i++) { + if (q && str[i] == q) + break; + if (!q && (str[i] == ')' || str[i] == '&' || + str[i] == '|')) + break; + } + /* Skip quotes */ + if (q) + s++; len = i - s; if (len >= MAX_FILTER_STR_VAL) { parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); @@ -1301,7 +1320,7 @@ static int parse_pred(const char *str, void *data, /* go past the last quote */ i++; - } else if (isdigit(str[i])) { + } else if (isdigit(str[i]) || str[i] == '-') { /* Make sure the field is not a string */ if (is_string_field(field)) { @@ -1314,6 +1333,9 @@ static int parse_pred(const char *str, void *data, goto err_free; } + if (str[i] == '-') + i++; + /* We allow 0xDEADBEEF */ while (isalnum(str[i])) i++; @@ -1590,7 +1612,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (err) { filter_disable(file); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); - append_filter_err(pe, filter); + append_filter_err(tr, pe, filter); } else event_set_filtered_flag(file); @@ -1702,7 +1724,8 @@ static void create_filter_finish(struct filter_parse_error *pe) * information if @set_str is %true and the caller is responsible for * freeing it. */ -static int create_filter(struct trace_event_call *call, +static int create_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_string, bool set_str, struct event_filter **filterp) { @@ -1719,17 +1742,18 @@ static int create_filter(struct trace_event_call *call, err = process_preds(call, filter_string, *filterp, pe); if (err && set_str) - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); create_filter_finish(pe); return err; } -int create_event_filter(struct trace_event_call *call, +int create_event_filter(struct trace_array *tr, + struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp) { - return create_filter(call, filter_str, set_str, filterp); + return create_filter(tr, call, filter_str, set_str, filterp); } /** @@ -1756,7 +1780,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, kfree((*filterp)->filter_string); (*filterp)->filter_string = NULL; } else { - append_filter_err(pe, *filterp); + append_filter_err(tr, pe, *filterp); } } create_filter_finish(pe); @@ -1787,7 +1811,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) return 0; } - err = create_filter(call, filter_string, true, &filter); + err = create_filter(file->tr, call, filter_string, true, &filter); /* * Always swap the call filter with the new filter @@ -2043,7 +2067,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - err = create_filter(call, filter_str, false, &filter); + err = create_filter(NULL, call, filter_str, false, &filter); if (err) goto free_filter; @@ -2192,8 +2216,8 @@ static __init int ftrace_test_event_filter(void) struct test_filter_data_t *d = &test_filter_data[i]; int err; - err = create_filter(&event_ftrace_test_filter, d->filter, - false, &filter); + err = create_filter(NULL, &event_ftrace_test_filter, + d->filter, false, &filter); if (err) { printk(KERN_INFO "Failed to get filter for '%s', err %d\n", diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 449d90cfa151..ca6b0dff60c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -22,6 +22,57 @@ #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ +#define ERRORS \ + C(NONE, 
"No error"), \ + C(DUPLICATE_VAR, "Variable already defined"), \ + C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ + C(TOO_MANY_VARS, "Too many variables defined"), \ + C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ + C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ + C(TRIGGER_EEXIST, "Hist trigger already exists"), \ + C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ + C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ + C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ + C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ + C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ + C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ + C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ + C(HIST_NOT_FOUND, "Matching event histogram not found"), \ + C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ + C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ + C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ + C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ + C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ + C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ + C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ + C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ + C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ + C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ + C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ + C(TOO_MANY_PARAMS, "Too many action params"), \ + C(PARAM_NOT_FOUND, "Couldn't find param"), \ + C(INVALID_PARAM, "Invalid action param"), \ + C(ACTION_NOT_FOUND, "No action found"), \ + C(NO_SAVE_PARAMS, "No params found for save()"), \ + C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ + C(ACTION_MISMATCH, "Handler doesn't support action"), \ + C(NO_CLOSING_PAREN, "No closing paren found"), \ + C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ + C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ + C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ + C(VAR_NOT_FOUND, "Couldn't find variable"), \ + C(FIELD_NOT_FOUND, "Couldn't find field"), + +#undef C +#define C(a, b) HIST_ERR_##a + +enum { ERRORS }; + +#undef C +#define C(a, b) b + +static const char *err_text[] = { ERRORS }; + struct hist_field; typedef u64 (*hist_field_fn_t) (struct hist_field *field, @@ -313,9 +364,9 @@ struct hist_trigger_data { struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; unsigned int n_field_var_hists; - struct field_var *max_vars[SYNTH_FIELDS_MAX]; - unsigned int n_max_vars; - unsigned int n_max_var_str; + struct field_var *save_vars[SYNTH_FIELDS_MAX]; + unsigned int n_save_vars; + unsigned int n_save_var_str; }; static int synth_event_create(int argc, const char **argv); @@ -383,98 +434,201 @@ struct action_data; typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals); +typedef bool (*check_track_val_fn_t) (u64 track_val, u64 var_val); + +enum handler_id { + HANDLER_ONMATCH = 1, + HANDLER_ONMAX, + HANDLER_ONCHANGE, +}; + +enum action_id { + ACTION_SAVE = 1, + ACTION_TRACE, + ACTION_SNAPSHOT, +}; + 
struct action_data { + enum handler_id handler; + enum action_id action; + char *action_name; action_fn_t fn; + unsigned int n_params; char *params[SYNTH_FIELDS_MAX]; + /* + * When a histogram trigger is hit, the values of any + * references to variables, including variables being passed + * as parameters to synthetic events, are collected into a + * var_ref_vals array. This var_ref_idx is the index of the + * first param in the array to be passed to the synthetic + * event invocation. + */ + unsigned int var_ref_idx; + struct synth_event *synth_event; + bool use_trace_keyword; + char *synth_event_name; + union { struct { - /* - * When a histogram trigger is hit, the values of any - * references to variables, including variables being passed - * as parameters to synthetic events, are collected into a - * var_ref_vals array. This var_ref_idx is the index of the - * first param in the array to be passed to the synthetic - * event invocation. - */ - unsigned int var_ref_idx; - char *match_event; - char *match_event_system; - char *synth_event_name; - struct synth_event *synth_event; - } onmatch; + char *event; + char *event_system; + } match_data; struct { + /* + * var_str contains the $-unstripped variable + * name referenced by var_ref, and used when + * printing the action. Because var_ref + * creation is deferred to create_actions(), + * we need a per-action way to save it until + * then, thus var_str. + */ char *var_str; - char *fn_name; - unsigned int max_var_ref_idx; - struct hist_field *max_var; - struct hist_field *var; - } onmax; + + /* + * var_ref refers to the variable being + * tracked e.g onmax($var). + */ + struct hist_field *var_ref; + + /* + * track_var contains the 'invisible' tracking + * variable created to keep the current + * e.g. max value. 
+ */ + struct hist_field *track_var; + + check_track_val_fn_t check_val; + action_fn_t save_data; + } track_data; }; }; +struct track_data { + u64 track_val; + bool updated; + + unsigned int key_len; + void *key; + struct tracing_map_elt elt; -static char last_hist_cmd[MAX_FILTER_STR_VAL]; -static char hist_err_str[MAX_FILTER_STR_VAL]; + struct action_data *action_data; + struct hist_trigger_data *hist_data; +}; -static void last_cmd_set(char *str) +struct hist_elt_data { + char *comm; + u64 *var_ref_vals; + char *field_var_str[SYNTH_FIELDS_MAX]; +}; + +struct snapshot_context { + struct tracing_map_elt *elt; + void *key; +}; + +static void track_data_free(struct track_data *track_data) { - if (!str) + struct hist_elt_data *elt_data; + + if (!track_data) return; - strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); + kfree(track_data->key); + + elt_data = track_data->elt.private_data; + if (elt_data) { + kfree(elt_data->comm); + kfree(elt_data); + } + + kfree(track_data); } -static void hist_err(char *str, char *var) +static struct track_data *track_data_alloc(unsigned int key_len, + struct action_data *action_data, + struct hist_trigger_data *hist_data) { - int maxlen = MAX_FILTER_STR_VAL - 1; + struct track_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + struct hist_elt_data *elt_data; - if (!str) - return; + if (!data) + return ERR_PTR(-ENOMEM); - if (strlen(hist_err_str)) - return; + data->key = kzalloc(key_len, GFP_KERNEL); + if (!data->key) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } - if (!var) - var = ""; + data->key_len = key_len; + data->action_data = action_data; + data->hist_data = hist_data; - if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) - return; + elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL); + if (!elt_data) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } + data->elt.private_data = elt_data; + + elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!elt_data->comm) { + track_data_free(data); + return ERR_PTR(-ENOMEM); + } - strcat(hist_err_str, str); - strcat(hist_err_str, var); + return data; } -static void hist_err_event(char *str, char *system, char *event, char *var) +static char last_cmd[MAX_FILTER_STR_VAL]; +static char last_cmd_loc[MAX_FILTER_STR_VAL]; + +static int errpos(char *str) { - char err[MAX_FILTER_STR_VAL]; + return err_pos(last_cmd, str); +} - if (system && var) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); - else if (system) - snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); - else - strscpy(err, var, MAX_FILTER_STR_VAL); +static void last_cmd_set(struct trace_event_file *file, char *str) +{ + const char *system = NULL, *name = NULL; + struct trace_event_call *call; - hist_err(str, err); + if (!str) + return; + + strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); + + if (file) { + call = file->event_call; + + system = call->class->system; + if (system) { + name = trace_event_name(call); + if (!name) + system = NULL; + } + } + + if (system) + snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); } -static void hist_err_clear(void) +static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) { - hist_err_str[0] = '\0'; + tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, + err_type, err_pos); } -static bool have_hist_err(void) +static void hist_err_clear(void) { - if (strlen(hist_err_str)) - return true; - - return false; + last_cmd[0] = '\0'; + last_cmd_loc[0] = '\0'; } struct synth_trace_event { @@ -1078,12 +1232,12 @@ static struct 
synth_event *alloc_synth_event(const char *name, int n_fields, static void action_trace(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, + struct ring_buffer_event *rbe, void *key, struct action_data *data, u64 *var_ref_vals) { - struct synth_event *event = data->onmatch.synth_event; + struct synth_event *event = data->synth_event; - trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); + trace_synth(event, var_ref_vals, data->var_ref_idx); } struct hist_var_data { @@ -1200,8 +1354,8 @@ static int synth_event_create(int argc, const char **argv) /* This interface accepts group name prefix */ if (strchr(name, '/')) { - len = sizeof(SYNTH_SYSTEM "/") - 1; - if (strncmp(name, SYNTH_SYSTEM "/", len)) + len = str_has_prefix(name, SYNTH_SYSTEM "/"); + if (len == 0) return -EINVAL; name += len; } @@ -1603,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, if (find_var_field(var_hist_data, var_name)) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); return NULL; } @@ -1644,9 +1798,9 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) { - char *system = data->onmatch.match_event_system; - char *event_name = data->onmatch.match_event; + if (data->handler == HANDLER_ONMATCH) { + char *system = data->match_data.event_system; + char *event_name = data->match_data.event; file = find_var_file(tr, system, event_name, var_name); if (!file) @@ -1654,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) hist_field = find_file_var(file, var_name); if (hist_field) { if (found) { - hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); + hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, + errpos(var_name)); return ERR_PTR(-EINVAL); } @@ -1691,12 +1846,6 @@ static struct hist_field *find_event_var(struct hist_trigger_data *hist_data, return hist_field; } -struct hist_elt_data { - char *comm; - u64 *var_ref_vals; - char *field_var_str[SYNTH_FIELDS_MAX]; -}; - static u64 hist_field_var_ref(struct hist_field *hist_field, struct tracing_map_elt *elt, struct ring_buffer_event *rbe, @@ -1705,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field, struct hist_elt_data *elt_data; u64 var_val = 0; + if (WARN_ON_ONCE(!elt)) + return var_val; + elt_data = elt->private_data; var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; @@ -1882,7 +2034,8 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) return ret; if ((str_has_prefix(str, "onmatch(")) || - (str_has_prefix(str, "onmax("))) { + (str_has_prefix(str, "onmax(")) || + (str_has_prefix(str, "onchange("))) { attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL); if (!attrs->action_str[attrs->n_actions]) { ret = -ENOMEM; @@ -1891,11 +2044,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) attrs->n_actions++; ret = 0; } - return ret; } -static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) +static int parse_assignment(struct trace_array *tr, + char *str, struct hist_trigger_attrs *attrs) { int ret = 0; @@ -1951,7 +2104,7 @@ static int parse_assignment(char *str, struct 
hist_trigger_attrs *attrs) char *assignment; if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", str); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); ret = -EINVAL; goto out; } @@ -1968,7 +2121,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) return ret; } -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) +static struct hist_trigger_attrs * +parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) { struct hist_trigger_attrs *attrs; int ret = 0; @@ -1981,7 +2135,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) char *str = strsep(&trigger_str, ":"); if (strchr(str, '=')) { - ret = parse_assignment(str, attrs); + ret = parse_assignment(tr, str, attrs); if (ret) goto free; } else if (strcmp(str, "pause") == 0) @@ -2030,7 +2184,7 @@ static inline void save_comm(char *comm, struct task_struct *task) return; } - memcpy(comm, task->comm, TASK_COMM_LEN); + strncpy(comm, task->comm, TASK_COMM_LEN); } static void hist_elt_data_free(struct hist_elt_data *elt_data) @@ -2076,7 +2230,7 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt) } } - n_str = hist_data->n_field_var_str + hist_data->n_max_var_str; + n_str = hist_data->n_field_var_str + hist_data->n_save_var_str; size = STR_VAR_LEN_MAX; @@ -2537,6 +2691,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, char *var_name) { struct hist_field *var_field = NULL, *ref_field = NULL; + struct trace_array *tr = hist_data->event_file->tr; if (!is_var_ref(var_name)) return NULL; @@ -2549,8 +2704,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, system, event_name); if (!ref_field) - hist_err_event("Couldn't find variable: $", - system, event_name, var_name); + hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); return ref_field; } @@ -2561,6 +2715,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, { struct ftrace_event_field *field = NULL; char *field_name, *modifier, *str; + struct trace_array *tr = file->tr; modifier = str = kstrdup(field_str, GFP_KERNEL); if (!modifier) @@ -2584,7 +2739,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else if (strcmp(modifier, "usecs") == 0) *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; else { - hist_err("Invalid field modifier: ", modifier); + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); field = ERR_PTR(-EINVAL); goto out; } @@ -2600,7 +2755,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err("Couldn't find field: ", field_name); + hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); field = ERR_PTR(-EINVAL); goto out; } @@ -2662,7 +2817,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); if (!s) { - hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); + hist_field = parse_var_ref(hist_data, ref_system, + ref_event, ref_var); if (hist_field) { if (var_name) { hist_field = create_alias(hist_data, hist_field, var_name); @@ -2711,7 +2867,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, /* we support only -(xxx) i.e. 
explicit parens required */ if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; } @@ -2766,7 +2922,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } -static int check_expr_operands(struct hist_field *operand1, +static int check_expr_operands(struct trace_array *tr, + struct hist_field *operand1, struct hist_field *operand2) { unsigned long operand1_flags = operand1->flags; @@ -2794,7 +2951,7 @@ static int check_expr_operands(struct hist_field *operand1, if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { - hist_err("Timestamp units in expression don't match", NULL); + hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); return -EINVAL; } @@ -2812,7 +2969,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *sep, *operand1_str; if (level > 3) { - hist_err("Too many subexpressions (3 max): ", str); + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } @@ -2857,7 +3014,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(operand1, operand2); + ret = check_expr_operands(file->tr, operand1, operand2); if (ret) goto free; @@ -3050,16 +3207,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, int ret; if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { - hist_err_event("onmatch: Too many field variables defined: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); return ERR_PTR(-EINVAL); } file = event_file(tr, subsys_name, event_name); if (IS_ERR(file)) { - hist_err_event("onmatch: Event file not found: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); ret = PTR_ERR(file); return ERR_PTR(ret); } @@ -3072,8 +3227,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, */ hist_data = find_compatible_hist(target_hist_data, file); if (!hist_data) { - hist_err_event("onmatch: Matching event histogram not found: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3134,8 +3288,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, kfree(cmd); kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't create histogram for field: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); return ERR_PTR(ret); } @@ -3147,8 +3300,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, if (IS_ERR_OR_NULL(event_var)) { kfree(var_hist->cmd); kfree(var_hist); - hist_err_event("onmatch: Couldn't find synthetic variable: ", - subsys_name, event_name, field_name); + hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); return ERR_PTR(-EINVAL); } @@ -3225,13 +3377,13 @@ static void update_field_vars(struct hist_trigger_data *hist_data, hist_data->n_field_vars, 0); } -static void update_max_vars(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct ring_buffer_event *rbe, - void *rec) +static void save_track_data_vars(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - __update_field_vars(elt, 
rbe, rec, hist_data->max_vars, - hist_data->n_max_vars, hist_data->n_field_var_str); + __update_field_vars(elt, rbe, rec, hist_data->save_vars, + hist_data->n_save_vars, hist_data->n_field_var_str); } static struct hist_field *create_var(struct hist_trigger_data *hist_data, @@ -3281,25 +3433,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, { struct hist_field *val = NULL, *var = NULL; unsigned long flags = HIST_FIELD_FL_VAR; + struct trace_array *tr = file->tr; struct field_var *field_var; int ret = 0; if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { - hist_err("Too many field variables defined: ", field_name); + hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); ret = -EINVAL; goto err; } val = parse_atom(hist_data, file, field_name, &flags, NULL); if (IS_ERR(val)) { - hist_err("Couldn't parse field variable: ", field_name); + hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); ret = PTR_ERR(val); goto err; } var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { - hist_err("Couldn't create or find variable: ", field_name); + hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); kfree(val); ret = PTR_ERR(var); goto err; @@ -3366,18 +3519,196 @@ create_target_field_var(struct hist_trigger_data *target_hist_data, return create_field_var(target_hist_data, file, var_name); } -static void onmax_print(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, - struct action_data *data) +static bool check_track_val_max(u64 track_val, u64 var_val) +{ + if (var_val <= track_val) + return false; + + return true; +} + +static bool check_track_val_changed(u64 track_val, u64 var_val) +{ + if (var_val == track_val) + return false; + + return true; +} + +static u64 get_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) { - unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; + unsigned int track_var_idx = data->track_data.track_var->var.idx; + u64 track_val; + + track_val = tracing_map_read_var(elt, track_var_idx); - seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); + return track_val; +} - for (i = 0; i < hist_data->n_max_vars; i++) { - struct hist_field *save_val = hist_data->max_vars[i]->val; - struct hist_field *save_var = hist_data->max_vars[i]->var; +static void save_track_val(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data, u64 var_val) +{ + unsigned int track_var_idx = data->track_data.track_var->var.idx; + + tracing_map_set_var(elt, track_var_idx, var_val); +} + +static void save_track_data(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) +{ + if (data->track_data.save_data) + data->track_data.save_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); +} + +static bool check_track_val(struct tracing_map_elt *elt, + struct action_data *data, + u64 var_val) +{ + struct hist_trigger_data *hist_data; + u64 track_val; + + hist_data = data->track_data.track_var->hist_data; + track_val = get_track_val(hist_data, elt, data); + + return data->track_data.check_val(track_val, var_val); +} + +#ifdef CONFIG_TRACER_SNAPSHOT +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + /* called with tr->max_lock held */ + struct track_data *track_data = tr->cond_snapshot->cond_data; + struct 
hist_elt_data *elt_data, *track_elt_data; + struct snapshot_context *context = cond_data; + struct action_data *action; + u64 track_val; + + if (!track_data) + return false; + + action = track_data->action_data; + + track_val = get_track_val(track_data->hist_data, context->elt, + track_data->action_data); + + if (!action->track_data.check_val(track_data->track_val, track_val)) + return false; + + track_data->track_val = track_val; + memcpy(track_data->key, context->key, track_data->key_len); + + elt_data = context->elt->private_data; + track_elt_data = track_data->elt.private_data; + if (elt_data->comm) + strncpy(track_elt_data->comm, elt_data->comm, TASK_COMM_LEN); + + track_data->updated = true; + + return true; +} + +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) +{ + struct trace_event_file *file = hist_data->event_file; + struct snapshot_context context; + + context.elt = elt; + context.key = key; + + tracing_snapshot_cond(file->tr, &context); +} + +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt); + +static struct action_data *snapshot_action(struct hist_trigger_data *hist_data) +{ + unsigned int i; + + if (!hist_data->n_actions) + return NULL; + + for (i = 0; i < hist_data->n_actions; i++) { + struct action_data *data = hist_data->actions[i]; + + if (data->action == ACTION_SNAPSHOT) + return data; + } + + return NULL; +} + +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) +{ + struct trace_event_file *file = hist_data->event_file; + struct track_data *track_data; + struct action_data *action; + + track_data = tracing_cond_snapshot_data(file->tr); + if (!track_data) + return; + + if (!track_data->updated) + return; + + action = snapshot_action(hist_data); + if (!action) + return; + + seq_puts(m, "\nSnapshot taken (see tracing/snapshot). Details:\n"); + seq_printf(m, "\ttriggering value { %s(%s) }: %10llu", + action->handler == HANDLER_ONMAX ? 
"onmax" : "onchange", + action->track_data.var_str, track_data->track_val); + + seq_puts(m, "\ttriggered by event with key: "); + hist_trigger_print_key(m, hist_data, track_data->key, &track_data->elt); + seq_putc(m, '\n'); +} +#else +static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) +{ + return false; +} +static void save_track_data_snapshot(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, + u64 *var_ref_vals) {} +static void track_data_snapshot_print(struct seq_file *m, + struct hist_trigger_data *hist_data) {} +#endif /* CONFIG_TRACER_SNAPSHOT */ + +static void track_data_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, + struct action_data *data) +{ + u64 track_val = get_track_val(hist_data, elt, data); + unsigned int i, save_var_idx; + + if (data->handler == HANDLER_ONMAX) + seq_printf(m, "\n\tmax: %10llu", track_val); + else if (data->handler == HANDLER_ONCHANGE) + seq_printf(m, "\n\tchanged: %10llu", track_val); + + if (data->action == ACTION_SNAPSHOT) + return; + + for (i = 0; i < hist_data->n_save_vars; i++) { + struct hist_field *save_val = hist_data->save_vars[i]->val; + struct hist_field *save_var = hist_data->save_vars[i]->var; u64 val; save_var_idx = save_var->var.idx; @@ -3392,64 +3723,82 @@ static void onmax_print(struct seq_file *m, } } -static void onmax_save(struct hist_trigger_data *hist_data, - struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, - struct action_data *data, u64 *var_ref_vals) +static void ontrack_action(struct hist_trigger_data *hist_data, + struct tracing_map_elt *elt, void *rec, + struct ring_buffer_event *rbe, void *key, + struct action_data *data, u64 *var_ref_vals) { - unsigned int max_idx = data->onmax.max_var->var.idx; - unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; - - u64 var_val, max_val; + u64 var_val = var_ref_vals[data->track_data.var_ref->var_ref_idx]; - var_val = var_ref_vals[max_var_ref_idx]; - max_val = tracing_map_read_var(elt, max_idx); - - if (var_val <= max_val) - return; - - tracing_map_set_var(elt, max_idx, var_val); - - update_max_vars(hist_data, elt, rbe, rec); + if (check_track_val(elt, data, var_val)) { + save_track_val(hist_data, elt, data, var_val); + save_track_data(hist_data, elt, rec, rbe, key, data, var_ref_vals); + } } -static void onmax_destroy(struct action_data *data) +static void action_data_destroy(struct action_data *data) { unsigned int i; - destroy_hist_field(data->onmax.max_var, 0); - destroy_hist_field(data->onmax.var, 0); + lockdep_assert_held(&event_mutex); - kfree(data->onmax.var_str); - kfree(data->onmax.fn_name); + kfree(data->action_name); for (i = 0; i < data->n_params; i++) kfree(data->params[i]); + if (data->synth_event) + data->synth_event->ref--; + + kfree(data->synth_event_name); + kfree(data); } -static int onmax_create(struct hist_trigger_data *hist_data, - struct action_data *data) +static void track_data_destroy(struct hist_trigger_data *hist_data, + struct action_data *data) { struct trace_event_file *file = hist_data->event_file; - struct hist_field *var_field, *ref_field, *max_var; - unsigned int var_ref_idx = hist_data->n_var_refs; - struct field_var *field_var; - char *onmax_var_str, *param; - unsigned int i; + + destroy_hist_field(data->track_data.track_var, 0); + + if (data->action == ACTION_SNAPSHOT) { + struct track_data *track_data; + + track_data = 
tracing_cond_snapshot_data(file->tr); + if (track_data && track_data->hist_data == hist_data) { + tracing_snapshot_cond_disable(file->tr); + track_data_free(track_data); + } + } + + kfree(data->track_data.var_str); + + action_data_destroy(data); +} + +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data); + +static int track_data_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct hist_field *var_field, *ref_field, *track_var = NULL; + struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; + char *track_data_var_str; int ret = 0; - onmax_var_str = data->onmax.var_str; - if (onmax_var_str[0] != '$') { - hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); + track_data_var_str = data->track_data.var_str; + if (track_data_var_str[0] != '$') { + hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); return -EINVAL; } - onmax_var_str++; + track_data_var_str++; - var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); + var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); if (!var_field) { - hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); + hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); return -EINVAL; } @@ -3457,61 +3806,53 @@ static int onmax_create(struct hist_trigger_data *hist_data, if (!ref_field) return -ENOMEM; - data->onmax.var = ref_field; + data->track_data.var_ref = ref_field; - data->fn = onmax_save; - data->onmax.max_var_ref_idx = var_ref_idx; - max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); - if (IS_ERR(max_var)) { - hist_err("onmax: Couldn't create onmax variable: ", "max"); - ret = PTR_ERR(max_var); + if (data->handler == HANDLER_ONMAX) + track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + ret = PTR_ERR(track_var); goto out; } - data->onmax.max_var = max_var; - for (i = 0; i < data->n_params; i++) { - param = kstrdup(data->params[i], GFP_KERNEL); - if (!param) { - ret = -ENOMEM; - goto out; - } - - field_var = create_target_field_var(hist_data, NULL, NULL, param); - if (IS_ERR(field_var)) { - hist_err("onmax: Couldn't create field variable: ", param); - ret = PTR_ERR(field_var); - kfree(param); - goto out; - } - - hist_data->max_vars[hist_data->n_max_vars++] = field_var; - if (field_var->val->flags & HIST_FIELD_FL_STRING) - hist_data->n_max_var_str++; - - kfree(param); + if (data->handler == HANDLER_ONCHANGE) + track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); + if (IS_ERR(track_var)) { + hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); + ret = PTR_ERR(track_var); + goto out; } + data->track_data.track_var = track_var; + + ret = action_create(hist_data, data); out: return ret; } -static int parse_action_params(char *params, struct action_data *data) +static int parse_action_params(struct trace_array *tr, char *params, + struct action_data *data) { char *param, *saved_param; + bool first_param = true; int ret = 0; while (params) { - if (data->n_params >= SYNTH_FIELDS_MAX) + if (data->n_params >= SYNTH_FIELDS_MAX) { + hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); goto out; + } param = strsep(¶ms, ","); if (!param) { + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); ret = -EINVAL; goto out; } param = strstrip(param); if (strlen(param) < 2) { - hist_err("Invalid action param: ", param); + hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); ret 
= -EINVAL; goto out; } @@ -3522,86 +3863,164 @@ static int parse_action_params(char *params, struct action_data *data) goto out; } + if (first_param && data->use_trace_keyword) { + data->synth_event_name = saved_param; + first_param = false; + continue; + } + first_param = false; + data->params[data->n_params++] = saved_param; } out: return ret; } -static struct action_data *onmax_parse(char *str) +static int action_parse(struct trace_array *tr, char *str, struct action_data *data, + enum handler_id handler) { - char *onmax_fn_name, *onmax_var_str; - struct action_data *data; - int ret = -EINVAL; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); + char *action_name; + int ret = 0; - onmax_var_str = strsep(&str, ")"); - if (!onmax_var_str || !str) { + strsep(&str, "."); + if (!str) { + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); ret = -EINVAL; - goto free; + goto out; } - data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); - if (!data->onmax.var_str) { - ret = -ENOMEM; - goto free; + action_name = strsep(&str, "("); + if (!action_name || !str) { + hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); + ret = -EINVAL; + goto out; } - strsep(&str, "."); - if (!str) - goto free; - - onmax_fn_name = strsep(&str, "("); - if (!onmax_fn_name || !str) - goto free; - - if (str_has_prefix(onmax_fn_name, "save")) { + if (str_has_prefix(action_name, "save")) { char *params = strsep(&str, ")"); if (!params) { + hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); ret = -EINVAL; - goto free; + goto out; } - ret = parse_action_params(params, data); + ret = parse_action_params(tr, params, data); if (ret) - goto free; - } else + goto out; + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_vars; + data->fn = ontrack_action; + data->action = ACTION_SAVE; + } else if (str_has_prefix(action_name, "snapshot")) { + char *params = strsep(&str, ")"); + + if (!str) { + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); + ret = -EINVAL; + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + else { + hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); + ret = -EINVAL; + goto out; + } + + data->track_data.save_data = save_track_data_snapshot; + data->fn = ontrack_action; + data->action = ACTION_SNAPSHOT; + } else { + char *params = strsep(&str, ")"); + + if (str_has_prefix(action_name, "trace")) + data->use_trace_keyword = true; + + if (params) { + ret = parse_action_params(tr, params, data); + if (ret) + goto out; + } + + if (handler == HANDLER_ONMAX) + data->track_data.check_val = check_track_val_max; + else if (handler == HANDLER_ONCHANGE) + data->track_data.check_val = check_track_val_changed; + + if (handler != HANDLER_ONMATCH) { + data->track_data.save_data = action_trace; + data->fn = ontrack_action; + } else + data->fn = action_trace; + + data->action = ACTION_TRACE; + } + + data->action_name = kstrdup(action_name, GFP_KERNEL); + if (!data->action_name) { + ret = -ENOMEM; + goto out; + } + + data->handler = handler; + out: + return ret; +} + +static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, + char *str, enum handler_id 
handler) +{ + struct action_data *data; + int ret = -EINVAL; + char *var_str; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return ERR_PTR(-ENOMEM); + + var_str = strsep(&str, ")"); + if (!var_str || !str) { + ret = -EINVAL; goto free; + } - data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); - if (!data->onmax.fn_name) { + data->track_data.var_str = kstrdup(var_str, GFP_KERNEL); + if (!data->track_data.var_str) { ret = -ENOMEM; goto free; } + + ret = action_parse(hist_data->event_file->tr, str, data, handler); + if (ret) + goto free; out: return data; free: - onmax_destroy(data); + track_data_destroy(hist_data, data); data = ERR_PTR(ret); goto out; } static void onmatch_destroy(struct action_data *data) { - unsigned int i; - - lockdep_assert_held(&event_mutex); + kfree(data->match_data.event); + kfree(data->match_data.event_system); - kfree(data->onmatch.match_event); - kfree(data->onmatch.match_event_system); - kfree(data->onmatch.synth_event_name); - - for (i = 0; i < data->n_params; i++) - kfree(data->params[i]); - - if (data->onmatch.synth_event) - data->onmatch.synth_event->ref--; - - kfree(data); + action_data_destroy(data); } static void destroy_field_var(struct field_var *field_var) @@ -3651,33 +4070,35 @@ static int check_synth_field(struct synth_event *event, } static struct hist_field * -onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, - char *system, char *event, char *var) +trace_action_find_var(struct hist_trigger_data *hist_data, + struct action_data *data, + char *system, char *event, char *var) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field; var++; /* skip '$' */ hist_field = find_target_event_var(hist_data, system, event, var); if (!hist_field) { - if (!system) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; } hist_field = find_event_var(hist_data, system, event, var); } if (!hist_field) - hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); + hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); return hist_field; } static struct hist_field * -onmatch_create_field_var(struct hist_trigger_data *hist_data, - struct action_data *data, char *system, - char *event, char *var) +trace_action_create_field_var(struct hist_trigger_data *hist_data, + struct action_data *data, char *system, + char *event, char *var) { struct hist_field *hist_field = NULL; struct field_var *field_var; @@ -3700,9 +4121,9 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, * looking for fields on the onmatch(system.event.xxx) * event. 
*/ - if (!system) { - system = data->onmatch.match_event_system; - event = data->onmatch.match_event; + if (!system && data->handler == HANDLER_ONMATCH) { + system = data->match_data.event_system; + event = data->match_data.event; } /* @@ -3724,24 +4145,31 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, goto out; } -static int onmatch_create(struct hist_trigger_data *hist_data, - struct trace_event_file *file, - struct action_data *data) +static int trace_action_create(struct hist_trigger_data *hist_data, + struct action_data *data) { + struct trace_array *tr = hist_data->event_file->tr; char *event_name, *param, *system = NULL; struct hist_field *hist_field, *var_ref; unsigned int i, var_ref_idx; unsigned int field_pos = 0; struct synth_event *event; + char *synth_event_name; int ret = 0; lockdep_assert_held(&event_mutex); - event = find_synth_event(data->onmatch.synth_event_name); + if (data->use_trace_keyword) + synth_event_name = data->synth_event_name; + else + synth_event_name = data->action_name; + + event = find_synth_event(synth_event_name); if (!event) { - hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); + hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); return -EINVAL; } + event->ref++; var_ref_idx = hist_data->n_var_refs; @@ -3769,13 +4197,15 @@ static int onmatch_create(struct hist_trigger_data *hist_data, } if (param[0] == '$') - hist_field = onmatch_find_var(hist_data, data, system, - event_name, param); + hist_field = trace_action_find_var(hist_data, data, + system, event_name, + param); else - hist_field = onmatch_create_field_var(hist_data, data, - system, - event_name, - param); + hist_field = trace_action_create_field_var(hist_data, + data, + system, + event_name, + param); if (!hist_field) { kfree(p); @@ -3797,22 +4227,20 @@ static int onmatch_create(struct hist_trigger_data *hist_data, continue; } - hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", - system, event_name, param); + hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); kfree(p); ret = -EINVAL; goto err; } if (field_pos != event->n_fields) { - hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); + hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); ret = -EINVAL; goto err; } - data->fn = action_trace; - data->onmatch.synth_event = event; - data->onmatch.var_ref_idx = var_ref_idx; + data->synth_event = event; + data->var_ref_idx = var_ref_idx; out: return ret; err: @@ -3821,10 +4249,77 @@ static int onmatch_create(struct hist_trigger_data *hist_data, goto out; } +static int action_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + struct trace_event_file *file = hist_data->event_file; + struct trace_array *tr = file->tr; + struct track_data *track_data; + struct field_var *field_var; + unsigned int i; + char *param; + int ret = 0; + + if (data->action == ACTION_TRACE) + return trace_action_create(hist_data, data); + + if (data->action == ACTION_SNAPSHOT) { + track_data = track_data_alloc(hist_data->key_size, data, hist_data); + if (IS_ERR(track_data)) { + ret = PTR_ERR(track_data); + goto out; + } + + ret = tracing_snapshot_cond_enable(file->tr, track_data, + cond_snapshot_update); + if (ret) + track_data_free(track_data); + + goto out; + } + + if (data->action == ACTION_SAVE) { + if (hist_data->n_save_vars) { + ret = -EEXIST; + hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); + goto out; + } + + for (i = 0; i < 
data->n_params; i++) { + param = kstrdup(data->params[i], GFP_KERNEL); + if (!param) { + ret = -ENOMEM; + goto out; + } + + field_var = create_target_field_var(hist_data, NULL, NULL, param); + if (IS_ERR(field_var)) { + hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, + errpos(param)); + ret = PTR_ERR(field_var); + kfree(param); + goto out; + } + + hist_data->save_vars[hist_data->n_save_vars++] = field_var; + if (field_var->val->flags & HIST_FIELD_FL_STRING) + hist_data->n_save_var_str++; + kfree(param); + } + } + out: + return ret; +} + +static int onmatch_create(struct hist_trigger_data *hist_data, + struct action_data *data) +{ + return action_create(hist_data, data); +} + static struct action_data *onmatch_parse(struct trace_array *tr, char *str) { char *match_event, *match_event_system; - char *synth_event_name, *params; struct action_data *data; int ret = -EINVAL; @@ -3834,59 +4329,34 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) match_event = strsep(&str, ")"); if (!match_event || !str) { - hist_err("onmatch: Missing closing paren: ", match_event); + hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); goto free; } match_event_system = strsep(&match_event, "."); if (!match_event) { - hist_err("onmatch: Missing subsystem for match event: ", match_event_system); + hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); goto free; } if (IS_ERR(event_file(tr, match_event_system, match_event))) { - hist_err_event("onmatch: Invalid subsystem or event name: ", - match_event_system, match_event, NULL); + hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); goto free; } - data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); - if (!data->onmatch.match_event) { + data->match_data.event = kstrdup(match_event, GFP_KERNEL); + if (!data->match_data.event) { ret = -ENOMEM; goto free; } - data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); - if (!data->onmatch.match_event_system) { - ret = -ENOMEM; - goto free; - } - - strsep(&str, "."); - if (!str) { - hist_err("onmatch: Missing . 
after onmatch(): ", str); - goto free; - } - - synth_event_name = strsep(&str, "("); - if (!synth_event_name || !str) { - hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); - goto free; - } - - data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); - if (!data->onmatch.synth_event_name) { + data->match_data.event_system = kstrdup(match_event_system, GFP_KERNEL); + if (!data->match_data.event_system) { ret = -ENOMEM; goto free; } - params = strsep(&str, ")"); - if (!params || !str || (str && strlen(str))) { - hist_err("onmatch: Missing closing paramlist paren: ", params); - goto free; - } - - ret = parse_action_params(params, data); + ret = action_parse(tr, str, data, HANDLER_ONMATCH); if (ret) goto free; out: @@ -3955,13 +4425,14 @@ static int create_var_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *var_name, char *expr_str) { + struct trace_array *tr = hist_data->event_file->tr; unsigned long flags = 0; if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) return -EINVAL; if (find_var(hist_data, file, var_name) && !hist_data->remove) { - hist_err("Variable already defined: ", var_name); + hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); return -EINVAL; } @@ -4018,8 +4489,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *field_str) { + struct trace_array *tr = hist_data->event_file->tr; struct hist_field *hist_field = NULL; - unsigned long flags = 0; unsigned int key_size; int ret = 0; @@ -4041,8 +4512,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, goto out; } - if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { - hist_err("Using variable references as keys not supported: ", field_str); + if (field_has_hist_vars(hist_field, 0)) { + hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); destroy_hist_field(hist_field, 0); ret = -EINVAL; goto out; @@ -4143,6 +4614,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data) static int parse_var_defs(struct hist_trigger_data *hist_data) { + struct trace_array *tr = hist_data->event_file->tr; char *s, *str, *var_name, *field_str; unsigned int i, j, n_vars = 0; int ret = 0; @@ -4156,13 +4628,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) var_name = strsep(&field_str, "="); if (!var_name || !field_str) { - hist_err("Malformed assignment: ", var_name); + hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, + errpos(var_name)); ret = -EINVAL; goto free; } if (n_vars == TRACING_MAP_VARS_MAX) { - hist_err("Too many variables defined: ", var_name); + hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); ret = -EINVAL; goto free; } @@ -4326,10 +4799,11 @@ static void destroy_actions(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) onmatch_destroy(data); - else if (data->fn == onmax_save) - onmax_destroy(data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_destroy(hist_data, data); else kfree(data); } @@ -4355,16 +4829,24 @@ static int parse_actions(struct hist_trigger_data *hist_data) ret = PTR_ERR(data); break; } - data->fn = action_trace; } else if ((len = str_has_prefix(str, "onmax("))) { char *action_str = str + len; - data = onmax_parse(action_str); + data = track_data_parse(hist_data, action_str, + HANDLER_ONMAX); + if (IS_ERR(data)) { + ret = 
PTR_ERR(data); + break; + } + } else if ((len = str_has_prefix(str, "onchange("))) { + char *action_str = str + len; + + data = track_data_parse(hist_data, action_str, + HANDLER_ONCHANGE); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } - data->fn = onmax_save; } else { ret = -EINVAL; break; @@ -4376,8 +4858,7 @@ static int parse_actions(struct hist_trigger_data *hist_data) return ret; } -static int create_actions(struct hist_trigger_data *hist_data, - struct trace_event_file *file) +static int create_actions(struct hist_trigger_data *hist_data) { struct action_data *data; unsigned int i; @@ -4386,14 +4867,18 @@ static int create_actions(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->attrs->n_actions; i++) { data = hist_data->actions[i]; - if (data->fn == action_trace) { - ret = onmatch_create(hist_data, file, data); + if (data->handler == HANDLER_ONMATCH) { + ret = onmatch_create(hist_data, data); if (ret) - return ret; - } else if (data->fn == onmax_save) { - ret = onmax_create(hist_data, data); + break; + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + ret = track_data_create(hist_data, data); if (ret) - return ret; + break; + } else { + ret = -EINVAL; + break; } } @@ -4409,26 +4894,51 @@ static void print_actions(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == onmax_save) - onmax_print(m, hist_data, elt, data); + if (data->action == ACTION_SNAPSHOT) + continue; + + if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + track_data_print(m, hist_data, elt, data); } } -static void print_onmax_spec(struct seq_file *m, - struct hist_trigger_data *hist_data, - struct action_data *data) +static void print_action_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) { unsigned int i; - seq_puts(m, ":onmax("); - seq_printf(m, "%s", data->onmax.var_str); - seq_printf(m, ").%s(", data->onmax.fn_name); - - for (i = 0; i < hist_data->n_max_vars; i++) { - seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); - if (i < hist_data->n_max_vars - 1) - seq_puts(m, ","); + if (data->action == ACTION_SAVE) { + for (i = 0; i < hist_data->n_save_vars; i++) { + seq_printf(m, "%s", hist_data->save_vars[i]->var->var.name); + if (i < hist_data->n_save_vars - 1) + seq_puts(m, ","); + } + } else if (data->action == ACTION_TRACE) { + if (data->use_trace_keyword) + seq_printf(m, "%s", data->synth_event_name); + for (i = 0; i < data->n_params; i++) { + if (i || data->use_trace_keyword) + seq_puts(m, ","); + seq_printf(m, "%s", data->params[i]); + } } +} + +static void print_track_data_spec(struct seq_file *m, + struct hist_trigger_data *hist_data, + struct action_data *data) +{ + if (data->handler == HANDLER_ONMAX) + seq_puts(m, ":onmax("); + else if (data->handler == HANDLER_ONCHANGE) + seq_puts(m, ":onchange("); + seq_printf(m, "%s", data->track_data.var_str); + seq_printf(m, ").%s(", data->action_name); + + print_action_spec(m, hist_data, data); + seq_puts(m, ")"); } @@ -4436,18 +4946,12 @@ static void print_onmatch_spec(struct seq_file *m, struct hist_trigger_data *hist_data, struct action_data *data) { - unsigned int i; - - seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, - data->onmatch.match_event); + seq_printf(m, ":onmatch(%s.%s).", data->match_data.event_system, + data->match_data.event); - seq_printf(m, "%s(", data->onmatch.synth_event->name); + seq_printf(m, "%s(", 
data->action_name); - for (i = 0; i < data->n_params; i++) { - if (i) - seq_puts(m, ","); - seq_printf(m, "%s", data->params[i]); - } + print_action_spec(m, hist_data, data); seq_puts(m, ")"); } @@ -4463,8 +4967,11 @@ static bool actions_match(struct hist_trigger_data *hist_data, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; struct action_data *data_test = hist_data_test->actions[i]; + char *action_name, *action_name_test; - if (data->fn != data_test->fn) + if (data->handler != data_test->handler) + return false; + if (data->action != data_test->action) return false; if (data->n_params != data_test->n_params) @@ -4475,22 +4982,30 @@ static bool actions_match(struct hist_trigger_data *hist_data, return false; } - if (data->fn == action_trace) { - if (strcmp(data->onmatch.synth_event_name, - data_test->onmatch.synth_event_name) != 0) - return false; - if (strcmp(data->onmatch.match_event_system, - data_test->onmatch.match_event_system) != 0) - return false; - if (strcmp(data->onmatch.match_event, - data_test->onmatch.match_event) != 0) + if (data->use_trace_keyword) + action_name = data->synth_event_name; + else + action_name = data->action_name; + + if (data_test->use_trace_keyword) + action_name_test = data_test->synth_event_name; + else + action_name_test = data_test->action_name; + + if (strcmp(action_name, action_name_test) != 0) + return false; + + if (data->handler == HANDLER_ONMATCH) { + if (strcmp(data->match_data.event_system, + data_test->match_data.event_system) != 0) return false; - } else if (data->fn == onmax_save) { - if (strcmp(data->onmax.var_str, - data_test->onmax.var_str) != 0) + if (strcmp(data->match_data.event, + data_test->match_data.event) != 0) return false; - if (strcmp(data->onmax.fn_name, - data_test->onmax.fn_name) != 0) + } else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) { + if (strcmp(data->track_data.var_str, + data_test->track_data.var_str) != 0) return false; } } @@ -4507,10 +5022,11 @@ static void print_actions_spec(struct seq_file *m, for (i = 0; i < hist_data->n_actions; i++) { struct action_data *data = hist_data->actions[i]; - if (data->fn == action_trace) + if (data->handler == HANDLER_ONMATCH) print_onmatch_spec(m, hist_data, data); - else if (data->fn == onmax_save) - print_onmax_spec(m, hist_data, data); + else if (data->handler == HANDLER_ONMAX || + data->handler == HANDLER_ONCHANGE) + print_track_data_spec(m, hist_data, data); } } @@ -4695,22 +5211,24 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - } - memcpy(compound_key + key_field->offset, key, size); + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, key, size); } static void hist_trigger_actions(struct hist_trigger_data *hist_data, struct tracing_map_elt *elt, void *rec, - struct ring_buffer_event *rbe, u64 *var_ref_vals) + struct ring_buffer_event *rbe, void *key, + u64 *var_ref_vals) { struct action_data *data; unsigned int i; for (i = 0; i < hist_data->n_actions; i++) { data = hist_data->actions[i]; - data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); + data->fn(hist_data, elt, rec, rbe, key, data, var_ref_vals); } } @@ -4723,7 +5241,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, u64 var_ref_vals[TRACING_MAP_VARS_MAX]; char compound_key[HIST_KEY_SIZE_MAX]; struct tracing_map_elt *elt = NULL; - struct 
stack_trace stacktrace; struct hist_field *key_field; u64 field_contents; void *key = NULL; @@ -4735,14 +5252,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, key_field = hist_data->fields[i]; if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { - stacktrace.max_entries = HIST_STACKTRACE_DEPTH; - stacktrace.entries = entries; - stacktrace.nr_entries = 0; - stacktrace.skip = HIST_STACKTRACE_SKIP; - - memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); - save_stack_trace(&stacktrace); - + memset(entries, 0, HIST_STACKTRACE_SIZE); + stack_trace_save(entries, HIST_STACKTRACE_DEPTH, + HIST_STACKTRACE_SKIP); key = entries; } else { field_contents = key_field->fn(key_field, elt, rbe, rec); @@ -4771,7 +5283,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec, hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); if (resolve_var_refs(hist_data, key, var_ref_vals, true)) - hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); + hist_trigger_actions(hist_data, elt, rec, rbe, key, var_ref_vals); } static void hist_trigger_stacktrace_print(struct seq_file *m, @@ -4783,7 +5295,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned int i; for (i = 0; i < max_entries; i++) { - if (stacktrace_entries[i] == ULONG_MAX) + if (!stacktrace_entries[i]) return; seq_printf(m, "%*c", 1 + spaces, ' '); @@ -4792,10 +5304,10 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, } } -static void -hist_trigger_entry_print(struct seq_file *m, - struct hist_trigger_data *hist_data, void *key, - struct tracing_map_elt *elt) +static void hist_trigger_print_key(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) { struct hist_field *key_field; char str[KSYM_SYMBOL_LEN]; @@ -4871,6 +5383,17 @@ hist_trigger_entry_print(struct seq_file *m, seq_puts(m, " "); seq_puts(m, "}"); +} + +static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, + void *key, + struct tracing_map_elt *elt) +{ + const char *field_name; + unsigned int i; + + hist_trigger_print_key(m, hist_data, key, elt); seq_printf(m, " hitcount: %10llu", tracing_map_read_sum(elt, HITCOUNT_IDX)); @@ -4937,6 +5460,8 @@ static void hist_trigger_show(struct seq_file *m, if (n_entries < 0) n_entries = 0; + track_data_snapshot_print(m, hist_data); + seq_printf(m, "\nTotals:\n Hits: %llu\n Entries: %u\n Dropped: %llu\n", (u64)atomic64_read(&hist_data->map->hits), n_entries, (u64)atomic64_read(&hist_data->map->drops)); @@ -4961,11 +5486,6 @@ static int hist_show(struct seq_file *m, void *v) hist_trigger_show(m, data, n++); } - if (have_hist_err()) { - seq_printf(m, "\nERROR: %s\n", hist_err_str); - seq_printf(m, " Last command: %s\n", last_hist_cmd); - } - out_unlock: mutex_unlock(&event_mutex); @@ -5330,6 +5850,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, { struct hist_trigger_data *hist_data = data->private_data; struct event_trigger_data *test, *named_data = NULL; + struct trace_array *tr = file->tr; int ret = 0; if (hist_data->attrs->name) { @@ -5337,7 +5858,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, if (named_data) { if (!hist_trigger_match(data, named_data, named_data, true)) { - hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); + hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); ret = -EINVAL; goto out; } @@ -5358,7 
+5879,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, else if (hist_data->attrs->clear) hist_clear(test); else { - hist_err("Hist trigger already exists", NULL); + hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); ret = -EEXIST; } goto out; @@ -5366,7 +5887,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, } new: if (hist_data->attrs->cont || hist_data->attrs->clear) { - hist_err("Can't clear or continue a nonexistent hist trigger", NULL); + hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); ret = -ENOENT; goto out; } @@ -5391,7 +5912,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, ret = tracing_set_clock(file->tr, hist_data->attrs->clock); if (ret) { - hist_err("Couldn't set trace_clock: ", clock); + hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); goto out; } @@ -5567,8 +6088,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, lockdep_assert_held(&event_mutex); if (glob && strlen(glob)) { - last_cmd_set(param); hist_err_clear(); + last_cmd_set(file, param); } if (!param) @@ -5609,7 +6130,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } - attrs = parse_hist_trigger_attrs(trigger); + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); @@ -5683,7 +6204,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, if (has_hist_vars(hist_data)) save_hist_vars(hist_data); - ret = create_actions(hist_data, file); + ret = create_actions(hist_data); if (ret) goto out_unreg; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str, goto out; /* The filter is for the 'trigger' event, not the triggered event */ - ret = create_event_filter(file->event_call, filter_str, false, &filter); + ret = create_event_filter(file->tr, file->event_call, + filter_str, false, &filter); /* * If create_event_filter() fails, filter still needs to be freed. * Which the calling code will do with data->filter. 
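The trace_events_hist.c and trace_events_trigger.c hunks above replace the old single-buffer hist_err() string with per-instance tracing_log_err() records (with errpos() caret positions) and add the onchange() handler plus save()/snapshot() actions alongside onmax(). The following is not part of the patch; it is a minimal user-space sketch of how the new syntax and error reporting might be exercised, assuming tracefs is mounted at /sys/kernel/tracing, that the instance's errors appear in its error_log file, and using sched_switch with its next_pid/next_comm/next_prio fields purely as placeholders.

/*
 * Sketch: attach an onchange().save() hist trigger using the syntax
 * introduced above, and dump the tracing error log if the kernel
 * rejects the command.  Paths, event and field names are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TRACEFS "/sys/kernel/tracing"

static void dump_error_log(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open(TRACEFS "/error_log", O_RDONLY);

	if (fd < 0)
		return;
	/* The log is expected to show the failed command with a caret
	 * at the position recorded via errpos(). */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stderr);
	close(fd);
}

int main(void)
{
	/* onchange($x).save(...) per the parsing added in action_parse() */
	const char *trigger =
		"hist:keys=next_pid:x=common_timestamp"
		":onchange($x).save(next_comm,next_prio)";
	int fd = open(TRACEFS "/events/sched/sched_switch/trigger", O_WRONLY);

	if (fd < 0) {
		perror("open trigger");
		return 1;
	}
	if (write(fd, trigger, strlen(trigger)) < 0) {
		perror("write trigger");
		dump_error_log();
	}
	close(fd);
	return 0;
}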
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c2af1560e856..69ebf3c2f1b5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -380,6 +380,7 @@ static void print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { trace_seq_putc(s, ' '); trace_print_lat_fmt(s, entry); + trace_seq_puts(s, " | "); } /* If the pid changed since the last trace, output this event */ @@ -501,6 +502,17 @@ static void print_graph_abs_time(u64 t, struct trace_seq *s) } static void +print_graph_rel_time(struct trace_iterator *iter, struct trace_seq *s) +{ + unsigned long long usecs; + + usecs = iter->ts - iter->trace_buffer->time_start; + do_div(usecs, NSEC_PER_USEC); + + trace_seq_printf(s, "%9llu us | ", usecs); +} + +static void print_graph_irq(struct trace_iterator *iter, unsigned long addr, enum trace_type type, int cpu, pid_t pid, u32 flags) { @@ -517,6 +529,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -725,6 +741,10 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, if (flags & TRACE_GRAPH_PRINT_ABS_TIME) print_graph_abs_time(iter->ts, s); + /* Relative time */ + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + print_graph_rel_time(iter, s); + /* Cpu */ if (flags & TRACE_GRAPH_PRINT_CPU) print_graph_cpu(s, cpu); @@ -1101,6 +1121,8 @@ static void print_lat_header(struct seq_file *s, u32 flags) if (flags & TRACE_GRAPH_PRINT_ABS_TIME) size += 16; + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + size += 16; if (flags & TRACE_GRAPH_PRINT_CPU) size += 4; if (flags & TRACE_GRAPH_PRINT_PROC) @@ -1125,12 +1147,14 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " REL TIME "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " CPU"); if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " TASK/PID "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " DURATION "); seq_puts(s, " FUNCTION CALLS\n"); @@ -1139,12 +1163,14 @@ static void __print_graph_headers_flags(struct trace_array *tr, seq_putc(s, '#'); if (flags & TRACE_GRAPH_PRINT_ABS_TIME) seq_puts(s, " | "); + if (flags & TRACE_GRAPH_PRINT_REL_TIME) + seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_CPU) seq_puts(s, " | "); if (flags & TRACE_GRAPH_PRINT_PROC) seq_puts(s, " | | "); if (lat) - seq_puts(s, "||||"); + seq_puts(s, "|||| "); if (flags & TRACE_GRAPH_PRINT_DURATION) seq_puts(s, " | | "); seq_puts(s, " | | | |\n"); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1e6db9cbe4dc..fa95139445b2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -277,7 +277,7 @@ static void move_to_next_cpu(void) * of this thread, than stop migrating for the duration * of the current test. 
*/ - if (!cpumask_equal(current_mask, &current->cpus_allowed)) + if (!cpumask_equal(current_mask, current->cpus_ptr)) goto disable; get_online_cpus(); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d3294721f119..a745b0cee5d3 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -14,6 +14,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include "trace.h" @@ -238,7 +239,7 @@ static void irqsoff_trace_close(struct trace_iterator *iter) #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ + TRACE_GRAPH_PRINT_REL_TIME | \ TRACE_GRAPH_PRINT_DURATION) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) @@ -365,7 +366,7 @@ out: __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); } -static inline void +static nokprobe_inline void start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -401,7 +402,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) atomic_dec(&data->disabled); } -static inline void +static nokprobe_inline void stop_critical_timing(unsigned long ip, unsigned long parent_ip, int pc) { int cpu; @@ -443,6 +444,7 @@ void start_critical_timings(void) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(start_critical_timings); +NOKPROBE_SYMBOL(start_critical_timings); void stop_critical_timings(void) { @@ -452,6 +454,7 @@ void stop_critical_timings(void) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1, pc); } EXPORT_SYMBOL_GPL(stop_critical_timings); +NOKPROBE_SYMBOL(stop_critical_timings); #ifdef CONFIG_FUNCTION_TRACER static bool function_enabled; @@ -611,6 +614,7 @@ void tracer_hardirqs_on(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) stop_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_on); void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { @@ -619,6 +623,7 @@ void tracer_hardirqs_off(unsigned long a0, unsigned long a1) if (!preempt_trace(pc) && irq_trace()) start_critical_timing(a0, a1, pc); } +NOKPROBE_SYMBOL(tracer_hardirqs_off); static int irqsoff_tracer_init(struct trace_array *tr) { diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..cca65044c14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -17,48 +17,42 @@ #include "trace.h" #include "trace_output.h" -static void ftrace_dump_buf(int skip_lines, long cpu_file) +static struct trace_iterator iter; +static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; + +static void ftrace_dump_buf(int skip_entries, long cpu_file) { - /* use static because iter can be a bit big for the stack */ - static struct trace_iterator iter; - static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; struct trace_array *tr; unsigned int old_userobj; int cnt = 0, cpu; - trace_init_global_iter(&iter); - iter.buffer_iter = buffer_iter; tr = iter.tr; - for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - old_userobj = tr->trace_flags; /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; kdb_printf("Dumping ftrace buffer:\n"); + if (skip_entries) + kdb_printf("(skipping %d entries)\n", skip_entries); - /* reset all but tr, trace, and overruns */ - memset(&iter.seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + trace_iterator_reset(&iter);
iter.iter_flags |= TRACE_FILE_LAT_FMT; - iter.pos = -1; if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -68,11 +62,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) kdb_printf("---------------------------------\n"); cnt++; - if (!skip_lines) { + if (!skip_entries) { print_trace_line(&iter); trace_printk_seq(&iter.seq); } else { - skip_lines--; + skip_entries--; } if (KDB_FLAG(CMD_INTERRUPT)) @@ -88,10 +82,6 @@ out: tr->trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - - for_each_tracing_cpu(cpu) { if (iter.buffer_iter[cpu]) { ring_buffer_read_finish(iter.buffer_iter[cpu]); iter.buffer_iter[cpu] = NULL; @@ -104,17 +94,19 @@ out: */ static int kdb_ftdump(int argc, const char **argv) { - int skip_lines = 0; + int skip_entries = 0; long cpu_file; char *cp; + int cnt; + int cpu; if (argc > 2) return KDB_ARGCOUNT; if (argc) { - skip_lines = simple_strtol(argv[1], &cp, 0); + skip_entries = simple_strtol(argv[1], &cp, 0); if (*cp) - skip_lines = 0; + skip_entries = 0; } if (argc == 2) { @@ -127,7 +119,29 @@ static int kdb_ftdump(int argc, const char **argv) } kdb_trap_printk++; - ftrace_dump_buf(skip_lines, cpu_file); + + trace_init_global_iter(&iter); + iter.buffer_iter = buffer_iter; + + for_each_tracing_cpu(cpu) { + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + + /* A negative skip_entries means skip all but the last entries */ + if (skip_entries < 0) { + if (cpu_file == RING_BUFFER_ALL_CPUS) + cnt = trace_total_entries(NULL); + else + cnt = trace_total_entries_cpu(NULL, cpu_file); + skip_entries = max(cnt + skip_entries, 0); + } + + ftrace_dump_buf(skip_entries, cpu_file); + + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + kdb_trap_printk--; return 0; @@ -135,8 +149,9 @@ static int kdb_ftdump(int argc, const char **argv) static __init int kdb_ftrace_register(void) { - kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", - "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); + kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", + "Dump ftrace log; -skip dumps last #entries", 0, + KDB_ENABLE_ALWAYS_SAFE); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,7 +35,7 @@ static struct dyn_event_operations trace_kprobe_ops = { .match = trace_kprobe_match, }; -/** +/* * Kprobe event core functions */ struct trace_kprobe { @@ -221,7 +221,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, tk->rp.maxactive = maxactive; - if (!event || !is_good_name(event)) { + if (!event || !group) { ret = -EINVAL; goto error; } @@ -231,11 +231,6 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk->tp.call.name) goto error; - if (!group || !is_good_name(group)) { - ret = -EINVAL; - goto error; 
- } - tk->tp.class.system = kstrdup(group, GFP_KERNEL); if (!tk->tp.class.system) goto error; @@ -446,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) else ret = register_kprobe(&tk->rp.kp); - if (ret == 0) { + if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; - } else if (ret == -EILSEQ) { - pr_warn("Probing address(0x%p) is not an instruction boundary.\n", - tk->rp.kp.addr); - ret = -EINVAL; - } return ret; } @@ -596,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) * Type of args: * FETCHARG:TYPE : use TYPE instead of unsigned long. */ - struct trace_kprobe *tk; + struct trace_kprobe *tk = NULL; int i, len, ret = 0; bool is_return = false; char *symbol = NULL, *tmp = NULL; @@ -620,40 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[]) if (argc < 2) return -ECANCELED; + trace_probe_log_init("trace_kprobe", argc, argv); + event = strchr(&argv[0][1], ':'); if (event) event++; - if (is_return && isdigit(argv[0][1])) { + if (isdigit(argv[0][1])) { + if (!is_return) { + trace_probe_log_err(1, MAXACT_NO_KPROBE); + goto parse_error; + } if (event) len = event - &argv[0][1] - 1; else len = strlen(&argv[0][1]); - if (len > MAX_EVENT_NAME_LEN - 1) - return -E2BIG; + if (len > MAX_EVENT_NAME_LEN - 1) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; + } memcpy(buf, &argv[0][1], len); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); - if (ret) { - pr_info("Failed to parse maxactive.\n"); - return ret; + if (ret || !maxactive) { + trace_probe_log_err(1, BAD_MAXACT); + goto parse_error; } /* kretprobes instances are iterated over via a list. The * maximum should stay reasonable. */ if (maxactive > KRETPROBE_MAXACTIVE_MAX) { - pr_info("Maxactive is too big (%d > %d).\n", - maxactive, KRETPROBE_MAXACTIVE_MAX); - return -E2BIG; + trace_probe_log_err(1, MAXACT_TOO_BIG); + goto parse_error; } } /* try to parse an address. if that fails, try to read the * input as a symbol. 
*/ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { + trace_probe_log_set_index(1); /* Check whether uprobe event specified */ - if (strchr(argv[1], '/') && strchr(argv[1], ':')) - return -ECANCELED; + if (strchr(argv[1], '/') && strchr(argv[1], ':')) { + ret = -ECANCELED; + goto error; + } /* a symbol specified */ symbol = kstrdup(argv[1], GFP_KERNEL); if (!symbol) @@ -661,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret || offset < 0 || offset > UINT_MAX) { - pr_info("Failed to parse either an address or a symbol.\n"); - goto out; + trace_probe_log_err(0, BAD_PROBE_ADDR); + goto parse_error; } if (kprobe_on_func_entry(NULL, symbol, offset)) flags |= TPARG_FL_FENTRY; if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { - pr_info("Given offset is not valid for return probe.\n"); - ret = -EINVAL; - goto out; + trace_probe_log_err(0, BAD_RETPROBE); + goto parse_error; } } - argc -= 2; argv += 2; + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) - goto out; + goto parse_error; } else { /* Make a new event name */ if (symbol) @@ -692,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[]) /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, - argc, is_return); + argc - 2, is_return); if (IS_ERR(tk)) { - pr_info("Failed to allocate trace_probe.(%d)\n", - (int)PTR_ERR(tk)); ret = PTR_ERR(tk); - goto out; + /* This must return -ENOMEM, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); + goto out; /* We know tk is not allocated */ } + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { @@ -708,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[]) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); kfree(tmp); if (ret) - goto error; + goto error; /* This can be -ENOMEM */ } ret = register_trace_kprobe(tk); - if (ret) + if (ret) { + trace_probe_log_set_index(1); + if (ret == -EILSEQ) + trace_probe_log_err(0, BAD_INSN_BNDRY); + else if (ret == -ENOENT) + trace_probe_log_err(0, BAD_PROBE_ADDR); + else if (ret != -ENOMEM) + trace_probe_log_err(0, FAIL_REG_PROBE); goto error; + } + out: + trace_probe_log_clear(); kfree(symbol); return ret; +parse_error: + ret = -EINVAL; error: free_trace_kprobe(tk); goto out; @@ -861,22 +875,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - mm_segment_t old_fs; int ret, len = 0; u8 c; - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + ret = probe_kernel_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - pagefault_enable(); - set_fs(old_fs); - return (ret < 0) ? 
ret : len; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 54373d93e251..ba751f993c3b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_seq_puts(s, "<stack trace>\n"); - for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { + for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) { if (trace_seq_has_overflowed(s)) break; diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..4d8e99fdbbbe 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -9,6 +9,7 @@ #include <linux/uaccess.h> #include <linux/module.h> #include <linux/ftrace.h> +#include <linux/kprobes.h> #include "trace.h" #define CREATE_TRACE_POINTS @@ -30,6 +31,7 @@ void trace_hardirqs_on(void) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on); +NOKPROBE_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { @@ -43,6 +45,7 @@ void trace_hardirqs_off(void) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off); +NOKPROBE_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { @@ -56,6 +59,7 @@ __visible void trace_hardirqs_on_caller(unsigned long caller_addr) lockdep_hardirqs_on(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); +NOKPROBE_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { @@ -69,6 +73,7 @@ __visible void trace_hardirqs_off_caller(unsigned long caller_addr) lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); +NOKPROBE_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_TRACE_PREEMPT_TOGGLE diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9962cb5da8ac..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,7 +13,12 @@ #include "trace_probe.h" -const char *reserved_field_names[] = { +#undef C +#define C(a, b) b + +static const char *trace_probe_err_text[] = { ERRORS }; + +static const char *reserved_field_names[] = { "common_type", "common_flags", "common_preempt_count", @@ -133,6 +138,60 @@ fail: return NULL; } +static struct trace_probe_log trace_probe_log; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv) +{ + trace_probe_log.subsystem = subsystem; + trace_probe_log.argc = argc; + trace_probe_log.argv = argv; + trace_probe_log.index = 0; +} + +void trace_probe_log_clear(void) +{ + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); +} + +void trace_probe_log_set_index(int index) +{ + trace_probe_log.index = index; +} + +void __trace_probe_log_err(int offset, int err_type) +{ + char *command, *p; + int i, len = 0, pos = 0; + + if (!trace_probe_log.argv) + return; + + /* Recalcurate the length and allocate buffer */ + for (i = 0; i < trace_probe_log.argc; i++) { + if (i == trace_probe_log.index) + pos = len; + len += strlen(trace_probe_log.argv[i]) + 1; + } + command = kzalloc(len, GFP_KERNEL); + if (!command) + return; + + /* And make a command string from argv array */ + p = command; + for (i = 0; i < trace_probe_log.argc; i++) { + len = strlen(trace_probe_log.argv[i]); + strcpy(p, trace_probe_log.argv[i]); + p[len] = ' '; + p += len + 1; + } + *(p - 1) = '\0'; + + tracing_log_err(NULL, trace_probe_log.subsystem, command, + trace_probe_err_text, err_type, pos + offset); + + 
kfree(command); +} + /* Split symbol and offset. */ int traceprobe_split_symbol_offset(char *symbol, long *offset) { @@ -156,26 +215,41 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) /* @buf must has MAX_EVENT_NAME_LEN size */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, - char *buf) + char *buf, int offset) { const char *slash, *event = *pevent; + int len; slash = strchr(event, '/'); if (slash) { if (slash == event) { - pr_info("Group name is not specified\n"); + trace_probe_log_err(offset, NO_GROUP_NAME); return -EINVAL; } if (slash - event + 1 > MAX_EVENT_NAME_LEN) { - pr_info("Group name is too long\n"); - return -E2BIG; + trace_probe_log_err(offset, GROUP_TOO_LONG); + return -EINVAL; } strlcpy(buf, event, slash - event + 1); + if (!is_good_name(buf)) { + trace_probe_log_err(offset, BAD_GROUP_NAME); + return -EINVAL; + } *pgroup = buf; *pevent = slash + 1; + offset += slash - event + 1; + event = *pevent; + } + len = strlen(event); + if (len == 0) { + trace_probe_log_err(offset, NO_EVENT_NAME); + return -EINVAL; + } else if (len > MAX_EVENT_NAME_LEN) { + trace_probe_log_err(offset, EVENT_TOO_LONG); + return -EINVAL; } - if (strlen(event) == 0) { - pr_info("Event name is not specified\n"); + if (!is_good_name(event)) { + trace_probe_log_err(offset, BAD_EVENT_NAME); return -EINVAL; } return 0; @@ -184,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) static int parse_probe_vars(char *arg, const struct fetch_type *t, - struct fetch_insn *code, unsigned int flags) + struct fetch_insn *code, unsigned int flags, int offs) { unsigned long param; int ret = 0; int len; if (strcmp(arg, "retval") == 0) { - if (flags & TPARG_FL_RETURN) + if (flags & TPARG_FL_RETURN) { code->op = FETCH_OP_RETVAL; - else + } else { + trace_probe_log_err(offs, RETVAL_ON_PROBE); ret = -EINVAL; + } } else if ((len = str_has_prefix(arg, "stack"))) { if (arg[len] == '\0') { code->op = FETCH_OP_STACKP; } else if (isdigit(arg[len])) { ret = kstrtoul(arg + len, 10, &param); - if (ret || ((flags & TPARG_FL_KERNEL) && - param > PARAM_MAX_STACK)) + if (ret) { + goto inval_var; + } else if ((flags & TPARG_FL_KERNEL) && + param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_STACK_NUM); ret = -EINVAL; - else { + } else { code->op = FETCH_OP_STACK; code->param = (unsigned int)param; } } else - ret = -EINVAL; + goto inval_var; } else if (strcmp(arg, "comm") == 0) { code->op = FETCH_OP_COMM; #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API } else if (((flags & TPARG_FL_MASK) == (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && (len = str_has_prefix(arg, "arg"))) { - if (!isdigit(arg[len])) - return -EINVAL; ret = kstrtoul(arg + len, 10, &param); - if (ret || !param || param > PARAM_MAX_STACK) + if (ret) { + goto inval_var; + } else if (!param || param > PARAM_MAX_STACK) { + trace_probe_log_err(offs, BAD_ARG_NUM); return -EINVAL; + } code->op = FETCH_OP_ARG; code->param = (unsigned int)param - 1; #endif } else - ret = -EINVAL; + goto inval_var; return ret; + +inval_var: + trace_probe_log_err(offs, BAD_VAR); + return -EINVAL; } /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *type, struct fetch_insn **pcode, struct fetch_insn *end, - unsigned int flags) + unsigned int flags, int offs) { struct fetch_insn *code = *pcode; unsigned long param; @@ -243,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, switch (arg[0]) { case '$': - ret = 
parse_probe_vars(arg + 1, type, code, flags); + ret = parse_probe_vars(arg + 1, type, code, flags, offs); break; case '%': /* named register */ @@ -252,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type, code->op = FETCH_OP_REG; code->param = (unsigned int)ret; ret = 0; - } + } else + trace_probe_log_err(offs, BAD_REG_NAME); break; case '@': /* memory, file-offset or symbol */ if (isdigit(arg[1])) { ret = kstrtoul(arg + 1, 0, &param); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_MEM_ADDR); break; + } /* load address */ code->op = FETCH_OP_IMM; code->immediate = param; } else if (arg[1] == '+') { /* kprobes don't support file offsets */ - if (flags & TPARG_FL_KERNEL) + if (flags & TPARG_FL_KERNEL) { + trace_probe_log_err(offs, FILE_ON_KPROBE); return -EINVAL; - + } ret = kstrtol(arg + 2, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_FILE_OFFS); break; + } code->op = FETCH_OP_FOFFS; code->immediate = (unsigned long)offset; // imm64? } else { /* uprobes don't support symbols */ - if (!(flags & TPARG_FL_KERNEL)) + if (!(flags & TPARG_FL_KERNEL)) { + trace_probe_log_err(offs, SYM_ON_UPROBE); return -EINVAL; - + } /* Preserve symbol for updating */ code->op = FETCH_NOP_SYMBOL; code->data = kstrdup(arg + 1, GFP_KERNEL); if (!code->data) return -ENOMEM; - if (++code == end) - return -E2BIG; - + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } code->op = FETCH_OP_IMM; code->immediate = 0; } /* These are fetching from memory */ - if (++code == end) - return -E2BIG; + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); + return -EINVAL; + } *pcode = code; code->op = FETCH_OP_DEREF; code->offset = offset; @@ -300,30 +395,41 @@ parse_probe_arg(char *arg, const struct fetch_type *type, case '+': /* deref memory */ arg++; /* Skip '+', because kstrtol() rejects it. */ + /* fall through */ case '-': tmp = strchr(arg, '('); - if (!tmp) + if (!tmp) { + trace_probe_log_err(offs, DEREF_NEED_BRACE); return -EINVAL; - + } *tmp = '\0'; ret = kstrtol(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(offs, BAD_DEREF_OFFS); break; - + } + offs += (tmp + 1 - arg) + (arg[0] != '-' ? 
1 : 0); arg = tmp + 1; tmp = strrchr(arg, ')'); - - if (tmp) { + if (!tmp) { + trace_probe_log_err(offs + strlen(arg), + DEREF_OPEN_BRACE); + return -EINVAL; + } else { const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - ret = parse_probe_arg(arg, t2, &code, end, flags); + ret = parse_probe_arg(arg, t2, &code, end, flags, offs); if (ret) break; - if (code->op == FETCH_OP_COMM) + if (code->op == FETCH_OP_COMM) { + trace_probe_log_err(offs, COMM_CANT_DEREF); + return -EINVAL; + } + if (++code == end) { + trace_probe_log_err(offs, TOO_MANY_OPS); return -EINVAL; - if (++code == end) - return -E2BIG; + } *pcode = code; code->op = FETCH_OP_DEREF; @@ -333,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, } if (!ret && code->op == FETCH_OP_NOP) { /* Parsed, but do not find fetch method */ + trace_probe_log_err(offs, BAD_FETCH_ARG); ret = -EINVAL; } return ret; @@ -364,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf, return -EINVAL; code++; if (code->op != FETCH_OP_NOP) - return -E2BIG; + return -EINVAL; *pcode = code; code->op = FETCH_OP_MOD_BF; @@ -377,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, - struct probe_arg *parg, unsigned int flags) + struct probe_arg *parg, unsigned int flags, int offset) { struct fetch_insn *code, *scode, *tmp = NULL; - char *t, *t2; + char *t, *t2, *t3; int ret, len; - if (strlen(arg) > MAX_ARGSTR_LEN) { - pr_info("Argument is too long.: %s\n", arg); - return -ENOSPC; + len = strlen(arg); + if (len > MAX_ARGSTR_LEN) { + trace_probe_log_err(offset, ARG_TOO_LONG); + return -EINVAL; + } else if (len == 0) { + trace_probe_log_err(offset, NO_ARG_BODY); + return -EINVAL; } + parg->comm = kstrdup(arg, GFP_KERNEL); - if (!parg->comm) { - pr_info("Failed to allocate memory for command '%s'.\n", arg); + if (!parg->comm) return -ENOMEM; - } + t = strchr(arg, ':'); if (t) { *t = '\0'; t2 = strchr(++t, '['); if (t2) { - *t2 = '\0'; - parg->count = simple_strtoul(t2 + 1, &t2, 0); - if (strcmp(t2, "]") || parg->count == 0) + *t2++ = '\0'; + t3 = strchr(t2, ']'); + if (!t3) { + offset += t2 + strlen(t2) - arg; + trace_probe_log_err(offset, + ARRAY_NO_CLOSE); + return -EINVAL; + } else if (t3[1] != '\0') { + trace_probe_log_err(offset + t3 + 1 - arg, + BAD_ARRAY_SUFFIX); return -EINVAL; - if (parg->count > MAX_ARRAY_LEN) - return -E2BIG; + } + *t3 = '\0'; + if (kstrtouint(t2, 0, &parg->count) || !parg->count) { + trace_probe_log_err(offset + t2 - arg, + BAD_ARRAY_NUM); + return -EINVAL; + } + if (parg->count > MAX_ARRAY_LEN) { + trace_probe_log_err(offset + t2 - arg, + ARRAY_TOO_BIG); + return -EINVAL; + } } } - /* - * The default type of $comm should be "string", and it can't be - * dereferenced. - */ - if (!t && strcmp(arg, "$comm") == 0) + + /* Since $comm can not be dereferred, we can find $comm by strcmp */ + if (strcmp(arg, "$comm") == 0) { + /* The type of $comm must be "string", and not an array. */ + if (parg->count || (t && strcmp(t, "string"))) + return -EINVAL; parg->type = find_fetch_type("string"); - else + } else parg->type = find_fetch_type(t); if (!parg->type) { - pr_info("Unsupported type: %s\n", t); + trace_probe_log_err(offset + (t ? 
(t - arg) : 0), BAD_TYPE); return -EINVAL; } parg->offset = *size; @@ -429,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, parg->count); } - code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); + code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); if (!code) return -ENOMEM; code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], - flags); + flags, offset); if (ret) goto fail; @@ -443,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, if (!strcmp(parg->type->name, "string")) { if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && code->op != FETCH_OP_COMM) { - pr_info("string only accepts memory or address.\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } @@ -455,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, */ code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } } @@ -468,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, } else { code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_ST_RAW; @@ -478,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, /* Modify operation */ if (t != NULL) { ret = __parse_bitfield_probe_arg(t, parg->type, &code); - if (ret) + if (ret) { + trace_probe_log_err(offset + t - arg, BAD_BITFIELD); goto fail; + } } /* Loop(Array) operation */ if (parg->count) { if (scode->op != FETCH_OP_ST_MEM && scode->op != FETCH_OP_ST_STRING) { - pr_info("array only accepts memory or address\n"); + trace_probe_log_err(offset + (t ? (t - arg) : 0), + BAD_STRING); ret = -EINVAL; goto fail; } code++; if (code->op != FETCH_OP_NOP) { - ret = -E2BIG; + trace_probe_log_err(offset, TOO_MANY_OPS); + ret = -EINVAL; goto fail; } code->op = FETCH_OP_LP_ARRAY; @@ -501,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, code->op = FETCH_OP_END; /* Shrink down the code buffer */ - parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); + parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); if (!parg->code) ret = -ENOMEM; else @@ -540,13 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, { struct probe_arg *parg = &tp->args[i]; char *body; - int ret; /* Increment count for freeing args in error case */ tp->nr_args++; body = strchr(arg, '='); if (body) { + if (body - arg > MAX_ARG_NAME_LEN) { + trace_probe_log_err(0, ARG_NAME_TOO_LONG); + return -EINVAL; + } else if (body == arg) { + trace_probe_log_err(0, NO_ARG_NAME); + return -EINVAL; + } parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); body++; } else { @@ -558,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, return -ENOMEM; if (!is_good_name(parg->name)) { - pr_info("Invalid argument[%d] name: %s\n", - i, parg->name); + trace_probe_log_err(0, BAD_ARG_NAME); return -EINVAL; } - if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { - pr_info("Argument[%d]: '%s' conflicts with another field.\n", - i, parg->name); + trace_probe_log_err(0, USED_ARG_NAME); return -EINVAL; } - /* Parse fetch argument */ - ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); - if (ret) - pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); - return ret; + return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, + body - arg); } void traceprobe_free_probe_arg(struct probe_arg *arg) diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 8a63f8bc01bc..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -32,6 +32,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_ARRAY_LEN 64 +#define MAX_ARG_NAME_LEN 32 #define MAX_STRING_SIZE PATH_MAX /* Reserved field names */ @@ -123,6 +124,7 @@ struct fetch_insn { /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ #define FETCH_INSN_MAX 16 +#define FETCH_TOKEN_COMM (-ECOMM) /* Fetch type information table */ struct fetch_type { @@ -279,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); extern int traceprobe_split_symbol_offset(char *symbol, long *offset); -extern int traceprobe_parse_event_name(const char **pevent, - const char **pgroup, char *buf); +int traceprobe_parse_event_name(const char **pevent, const char **pgroup, + char *buf, int offset); extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); @@ -297,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); #endif extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, size_t offset, struct trace_probe *tp); + +#undef ERRORS +#define ERRORS \ + C(FILE_NOT_FOUND, "Failed to find the given file"), \ + C(NO_REGULAR_FILE, "Not a regular file"), \ + C(BAD_REFCNT, "Invalid reference counter offset"), \ + C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ + C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ + C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ + C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ + C(BAD_MAXACT, "Invalid maxactive number"), \ + C(MAXACT_TOO_BIG, "Maxactive is too big"), \ + C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ + C(BAD_RETPROBE, "Retprobe address must be an function entry"), \ + C(NO_GROUP_NAME, "Group name is not specified"), \ + C(GROUP_TOO_LONG, "Group name is too long"), \ + C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ + C(NO_EVENT_NAME, "Event name is not specified"), \ + C(EVENT_TOO_LONG, "Event name is too long"), \ + C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ + C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ + C(BAD_STACK_NUM, "Invalid stack number"), \ + C(BAD_ARG_NUM, "Invalid argument number"), \ + C(BAD_VAR, "Invalid $-valiable specified"), \ + C(BAD_REG_NAME, "Invalid register name"), \ + C(BAD_MEM_ADDR, "Invalid memory address"), \ + C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ + C(BAD_FILE_OFFS, "Invalid file offset value"), \ + C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ + C(TOO_MANY_OPS, "Dereference is too much nested"), \ + C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ + C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ + C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ + C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ + C(BAD_FETCH_ARG, "Invalid fetch argument"), \ + C(ARRAY_NO_CLOSE, "Array is not closed"), \ + C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ + C(BAD_ARRAY_NUM, "Invalid array size"), \ + C(ARRAY_TOO_BIG, "Array number is too big"), \ + C(BAD_TYPE, "Unknown type is specified"), \ + C(BAD_STRING, "String accepts 
only memory argument"), \ + C(BAD_BITFIELD, "Invalid bitfield"), \ + C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ + C(NO_ARG_NAME, "Argument name is not specified"), \ + C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ + C(USED_ARG_NAME, "This argument name is already used"), \ + C(ARG_TOO_LONG, "Argument expression is too long"), \ + C(NO_ARG_BODY, "No argument expression"), \ + C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ + C(FAIL_REG_PROBE, "Failed to register probe event"), + +#undef C +#define C(a, b) TP_ERR_##a + +/* Define TP_ERR_ */ +enum { ERRORS }; + +/* Error text is defined in trace_probe.c */ + +struct trace_probe_log { + const char *subsystem; + const char **argv; + int argc; + int index; +}; + +void trace_probe_log_init(const char *subsystem, int argc, const char **argv); +void trace_probe_log_set_index(int index); +void trace_probe_log_clear(void); +void __trace_probe_log_err(int offset, int err); + +#define trace_probe_log_err(offs, err) \ + __trace_probe_log_err(offs, TP_ERR_##err) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 5c56afc17cf8..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -88,7 +88,7 @@ stage3: /* 3rd stage: store value to buffer */ if (unlikely(!dest)) { if (code->op == FETCH_OP_ST_STRING) { - ret += fetch_store_strlen(val + code->offset); + ret = fetch_store_strlen(val + code->offset); code++; goto array; } else @@ -180,10 +180,12 @@ store_trace_args(void *data, struct trace_probe *tp, struct pt_regs *regs, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, regs, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) + if (unlikely(ret < 0 && arg->dynamic)) { *dl = make_data_loc(0, dyndata - base); - else + } else { dyndata += ret; + maxlen -= ret; + } } } diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4ea7e6845efb..743b2b520d34 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -180,8 +180,11 @@ static void wakeup_trace_close(struct trace_iterator *iter) } #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ - TRACE_GRAPH_PRINT_ABS_TIME | \ - TRACE_GRAPH_PRINT_DURATION) + TRACE_GRAPH_PRINT_CPU | \ + TRACE_GRAPH_PRINT_REL_TIME | \ + TRACE_GRAPH_PRINT_DURATION | \ + TRACE_GRAPH_PRINT_OVERHEAD | \ + TRACE_GRAPH_PRINT_IRQS) static enum print_line_t wakeup_print_line(struct trace_iterator *iter) { @@ -472,6 +475,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); T0 = data->preempt_timestamp; T1 = ftrace_now(cpu); @@ -482,7 +486,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt, if (likely(!is_tracing_stopped())) { wakeup_trace->max_latency = delta; - update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu, NULL); } out_unlock: @@ -583,6 +587,7 @@ probe_wakeup(void *ignore, struct task_struct *p) data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + __trace_stack(wakeup_trace, flags, 0, pc); /* * We must be careful in using CALLER_ADDR2. 
But since wake_up diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* check the trace buffer */ ret = trace_test_buffer(&tr->trace_buffer, &count); - trace->reset(tr); + /* Need to also simulate the tr->reset to remove this fgraph_ops */ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(&fgraph_ops); + tracing_start(); if (!ret && !count) { diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,44 +18,32 @@ #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = - { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; -unsigned stack_trace_index[STACK_TRACE_ENTRIES]; +#define STACK_TRACE_ENTRIES 500 -/* - * Reserve one entry for the passed in ip. This will allow - * us to remove most or all of the stack size overhead - * added by the stack tracer itself. - */ -struct stack_trace stack_trace_max = { - .max_entries = STACK_TRACE_ENTRIES - 1, - .entries = &stack_dump_trace[0], -}; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -unsigned long stack_trace_max_size; -arch_spinlock_t stack_trace_max_lock = +static unsigned int stack_trace_nr_entries; +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; -static int last_stack_tracer_enabled; -void stack_trace_print(void) +static void print_max_stack(void) { long i; int size; pr_emerg(" Depth Size Location (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); - for (i = 0; i < stack_trace_max.nr_entries; i++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + for (i = 0; i < stack_trace_nr_entries; i++) { + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -65,16 +53,7 @@ void stack_trace_print(void) } } -/* - * When arch-specific code overrides this function, the following - * data should be filled up, assuming stack_trace_max_lock is held to - * prevent concurrent updates. - * stack_trace_index[] - * stack_trace_max - * stack_trace_max_size - */ -void __weak -check_stack(unsigned long ip, unsigned long *stack) +static void check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; static int tracer_frame; @@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack) stack_trace_max_size = this_size; - stack_trace_max.nr_entries = 0; - stack_trace_max.skip = 0; - - save_stack_trace(&stack_trace_max); + stack_trace_nr_entries = stack_trace_save(stack_dump_trace, + ARRAY_SIZE(stack_dump_trace) - 1, + 0); /* Skip over the overhead of the stack tracer itself */ - for (i = 0; i < stack_trace_max.nr_entries; i++) { + for (i = 0; i < stack_trace_nr_entries; i++) { if (stack_dump_trace[i] == ip) break; } @@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack) * Some archs may not have the passed in ip in the dump. 
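The trace_stack.c hunks above drop the old struct stack_trace/save_stack_trace() pair in favour of stack_trace_save(), which fills a plain unsigned long array and returns the number of entries, so the ULONG_MAX end markers and the nr_entries bookkeeping in the shared structure disappear. A hedged sketch of a caller of that array-based interface; dump_my_stack() and the buffer size are illustrative:

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/stacktrace.h>

/* Hedged sketch: capture and print the current stack with stack_trace_save(). */
static void dump_my_stack(void)
{
	unsigned long entries[32];
	unsigned int i, nr;

	/* skip no callers; the return value is how many slots were filled */
	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	for (i = 0; i < nr; i++)
		pr_info("  %pS\n", (void *)entries[i]);
}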
* If that happens, we need to show everything. */ - if (i == stack_trace_max.nr_entries) + if (i == stack_trace_nr_entries) i = 0; /* @@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack) * loop will only happen once. This code only takes place * on a new max, so it is far from a fast path. */ - while (i < stack_trace_max.nr_entries) { + while (i < stack_trace_nr_entries) { int found = 0; stack_trace_index[x] = this_size; p = start; - for (; p < top && i < stack_trace_max.nr_entries; p++) { - if (stack_dump_trace[i] == ULONG_MAX) - break; + for (; p < top && i < stack_trace_nr_entries; p++) { /* * The READ_ONCE_NOCHECK is used to let KASAN know that * this is not a stack-out-of-bounds error. @@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } - stack_trace_max.nr_entries = x; - for (; x < i; x++) - stack_dump_trace[x] = ULONG_MAX; + stack_trace_nr_entries = x; if (task_stack_end_corrupted(current)) { - stack_trace_print(); + print_max_stack(); BUG(); } @@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos) { long n = *pos - 1; - if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) + if (n >= stack_trace_nr_entries) return NULL; m->private = (void *)n; @@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - stack_trace_max.nr_entries); + stack_trace_nr_entries); if (!stack_tracer_enabled && !stack_trace_max_size) print_disabled(m); @@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v) i = *(long *)v; - if (i >= stack_trace_max.nr_entries || - stack_dump_trace[i] == ULONG_MAX) + if (i >= stack_trace_nr_entries) return 0; - if (i+1 == stack_trace_max.nr_entries || - stack_dump_trace[i+1] == ULONG_MAX) + if (i + 1 == stack_trace_nr_entries) size = stack_trace_index[i]; else size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + int was_enabled; int ret; mutex_lock(&stack_sysctl_mutex); + was_enabled = !!stack_tracer_enabled; ret = proc_dointvec(table, write, buffer, lenp, ppos); - if (ret || !write || - (last_stack_tracer_enabled == !!stack_tracer_enabled)) + if (ret || !write || (was_enabled == !!stack_tracer_enabled)) goto out; - last_stack_tracer_enabled = !!stack_tracer_enabled; - if (stack_tracer_enabled) register_ftrace_function(&trace_ops); else unregister_ftrace_function(&trace_ops); - out: mutex_unlock(&stack_sysctl_mutex); return ret; @@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str) strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE); stack_tracer_enabled = 1; - last_stack_tracer_enabled = 1; return 1; } __setup("stacktrace", enable_stacktrace); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f93a56d2db27..fa8fbff736d6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -314,6 +314,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) struct ring_buffer_event *event; struct ring_buffer *buffer; unsigned long irq_flags; + unsigned long args[6]; int pc; int syscall_nr; int size; @@ -347,7 +348,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) entry = ring_buffer_event_data(event); entry->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); + syscall_get_arguments(current, regs, args); + 
memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args); event_trigger_unlock_commit(trace_file, buffer, event, entry, irq_flags, pc); @@ -583,6 +585,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; + unsigned long args[6]; bool valid_prog_array; int syscall_nr; int rctx; @@ -613,8 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) return; rec->nr = syscall_nr; - syscall_get_arguments(current, regs, 0, sys_data->nb_args, - (unsigned long *)&rec->args); + syscall_get_arguments(current, regs, args); + memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if ((valid_prog_array && !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9bde07c06362..7860e3f59fad 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base) if (unlikely(!maxlen)) return -ENOMEM; - ret = strncpy_from_user(dst, src, maxlen); + if (addr == FETCH_TOKEN_COMM) + ret = strlcpy(dst, current->comm, maxlen); + else + ret = strncpy_from_user(dst, src, maxlen); if (ret >= 0) { if (ret == maxlen) dst[ret - 1] = '\0'; @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr) int len; void __user *vaddr = (void __force __user *) addr; - len = strnlen_user(vaddr, MAX_STRING_SIZE); + if (addr == FETCH_TOKEN_COMM) + len = strlen(current->comm) + 1; + else + len = strnlen_user(vaddr, MAX_STRING_SIZE); return (len > MAX_STRING_SIZE) ? 0 : len; } @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, case FETCH_OP_IMM: val = code->immediate; break; + case FETCH_OP_COMM: + val = FETCH_TOKEN_COMM; + break; case FETCH_OP_FOFFS: val = translate_user_vaddr(code->immediate); break; @@ -273,10 +282,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; - if (!event || !is_good_name(event)) - return ERR_PTR(-EINVAL); - - if (!group || !is_good_name(group)) + if (!event || !group) return ERR_PTR(-EINVAL); tu = kzalloc(SIZEOF_TRACE_UPROBE(nargs), GFP_KERNEL); @@ -420,8 +426,6 @@ end: /* * Argument syntax: * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] - * - * - Remove uprobe: -:[GRP/]EVENT */ static int trace_uprobe_create(int argc, const char **argv) { @@ -437,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv) ret = 0; ref_ctr_offset = 0; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') + switch (argv[0][0]) { + case 'r': is_return = true; - else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + + if (argc < 2) return -ECANCELED; if (argv[0][1] == ':') @@ -460,13 +471,19 @@ static int trace_uprobe_create(int argc, const char **argv) return -ECANCELED; } + trace_probe_log_init("trace_uprobe", argc, argv); + trace_probe_log_set_index(1); /* filename is the 2nd argument */ + *arg++ = '\0'; ret = kern_path(filename, LOOKUP_FOLLOW, &path); if (ret) { + trace_probe_log_err(0, FILE_NOT_FOUND); kfree(filename); + trace_probe_log_clear(); return ret; } if (!d_is_reg(path.dentry)) { + trace_probe_log_err(0, NO_REGULAR_FILE); ret = -EINVAL; goto fail_address_parse; } @@ -475,9 +492,16 @@ static int trace_uprobe_create(int argc, const char **argv) rctr = strchr(arg, '('); if (rctr) { rctr_end = 
strchr(rctr, ')'); - if (rctr > rctr_end || *(rctr_end + 1) != 0) { + if (!rctr_end) { + ret = -EINVAL; + rctr_end = rctr + strlen(rctr); + trace_probe_log_err(rctr_end - filename, + REFCNT_OPEN_BRACE); + goto fail_address_parse; + } else if (rctr_end[1] != '\0') { ret = -EINVAL; - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr_end + 1 - filename, + BAD_REFCNT_SUFFIX); goto fail_address_parse; } @@ -485,22 +509,23 @@ static int trace_uprobe_create(int argc, const char **argv) *rctr_end = '\0'; ret = kstrtoul(rctr, 0, &ref_ctr_offset); if (ret) { - pr_info("Invalid reference counter offset.\n"); + trace_probe_log_err(rctr - filename, BAD_REFCNT); goto fail_address_parse; } } /* Parse uprobe offset. */ ret = kstrtoul(arg, 0, &offset); - if (ret) + if (ret) { + trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); goto fail_address_parse; - - argc -= 2; - argv += 2; + } /* setup a probe */ + trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf); + ret = traceprobe_parse_event_name(&event, &group, buf, + event - argv[0]); if (ret) goto fail_address_parse; } else { @@ -522,10 +547,14 @@ static int trace_uprobe_create(int argc, const char **argv) kfree(tail); } + argc -= 2; + argv += 2; + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { - pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); + /* This must return -ENOMEM otherwise there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM); goto fail_address_parse; } tu->offset = offset; @@ -541,6 +570,7 @@ static int trace_uprobe_create(int argc, const char **argv) goto error; } + trace_probe_log_set_index(i + 2); ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, is_return ? TPARG_FL_RETURN : 0); kfree(tmp); @@ -549,20 +579,20 @@ static int trace_uprobe_create(int argc, const char **argv) } ret = register_trace_uprobe(tu); - if (ret) - goto error; - return 0; + if (!ret) + goto out; error: free_trace_uprobe(tu); +out: + trace_probe_log_clear(); return ret; fail_address_parse: + trace_probe_log_clear(); path_put(&path); kfree(filename); - pr_info("Failed to parse address or file.\n"); - return ret; } @@ -1306,7 +1336,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu, call->event.funcs = &uprobe_funcs; call->class->define_fields = uprobe_event_define_fields; - call->flags = TRACE_EVENT_FL_UPROBE; + call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; call->class->reg = trace_uprobe_register; call->data = tu; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..df3ade14ccbd 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2008-2014 Mathieu Desnoyers - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ #include <linux/module.h> #include <linux/mutex.h> diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..7be3e7530841 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -1,19 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * tsacct.c - System accounting over taskstats interface * * Copyright (C) Jay Lan, <jlan@sgi.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * */ #include <linux/kernel.h> diff --git a/kernel/ucount.c b/kernel/ucount.c index f48d1b6376a4..feb128c7b5d9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/stat.h> #include <linux/sysctl.h> diff --git a/kernel/umh.c b/kernel/umh.c index d937cbad903a..7f255b5a8845 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * umh - the kernel usermode helper */ diff --git a/kernel/up.c b/kernel/up.c index ff536f9cc8a2..862b460ab97a 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Uniprocessor-only support functions. The counterpart to kernel/smp.c */ @@ -34,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) } EXPORT_SYMBOL(smp_call_function_single_async); -int on_each_cpu(smp_call_func_t func, void *info, int wait) +void on_each_cpu(smp_call_func_t func, void *info, int wait) { unsigned long flags; local_irq_save(flags); func(info); local_irq_restore(flags); - return 0; } EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 9586b670a5b2..870ecd7c63ed 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/user-return-notifier.h> #include <linux/percpu.h> diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..5235d7f49982 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * The "user cache". 
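The kernel/up.c hunk above changes on_each_cpu() from int to void; the cross-call has no failure mode to report, so callers simply stop checking a return value. A hedged sketch of a caller after the change, with illustrative names:

#include <linux/smp.h>

/* Runs on each CPU with interrupts disabled; per-CPU work goes here. */
static void flush_one_cpu(void *info)
{
}

static void flush_all_cpus(void)
{
	/* wait == 1: return only after every CPU has run the callback */
	on_each_cpu(flush_one_cpu, NULL, 1);
}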
* @@ -62,9 +63,9 @@ struct user_namespace init_user_ns = { .ns.ops = &userns_operations, #endif .flags = USERNS_INIT_FLAGS, -#ifdef CONFIG_PERSISTENT_KEYRINGS - .persistent_keyring_register_sem = - __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), +#ifdef CONFIG_KEYS + .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), + .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), #endif }; EXPORT_SYMBOL_GPL(init_user_ns); @@ -140,8 +141,6 @@ static void free_user(struct user_struct *up, unsigned long flags) { uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); - key_put(up->uid_keyring); - key_put(up->session_keyring); kmem_cache_free(uid_cachep, up); } @@ -185,7 +184,7 @@ struct user_struct *alloc_uid(kuid_t uid) if (!up) { new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); if (!new) - goto out_unlock; + return NULL; new->uid = uid; refcount_set(&new->__count, 1); @@ -199,8 +198,6 @@ struct user_struct *alloc_uid(kuid_t uid) spin_lock_irq(&uidhash_lock); up = uid_hash_find(uid, hashent); if (up) { - key_put(new->uid_keyring); - key_put(new->session_keyring); kmem_cache_free(uid_cachep, new); } else { uid_hash_insert(new, hashent); @@ -210,9 +207,6 @@ struct user_struct *alloc_uid(kuid_t uid) } return up; - -out_unlock: - return NULL; } static int __init uid_cache_init(void) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..8eadadc478f9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1,9 +1,4 @@ -/* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. - */ +// SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> @@ -133,8 +128,9 @@ int create_user_ns(struct cred *new) ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); -#ifdef CONFIG_PERSISTENT_KEYRINGS - init_rwsem(&ns->persistent_keyring_register_sem); +#ifdef CONFIG_KEYS + INIT_LIST_HEAD(&ns->keyring_name_list); + init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) @@ -196,9 +192,7 @@ static void free_user_ns(struct work_struct *work) kfree(ns->projid_map.reverse); } retire_userns_sysctls(ns); -#ifdef CONFIG_PERSISTENT_KEYRINGS - key_put(ns->persistent_keyring_register); -#endif + key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); diff --git a/kernel/utsname.c b/kernel/utsname.c index dcd6be1996fe..f0e491193009 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004 IBM Corporation * * Author: Serge Hallyn <serue@us.ibm.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. 
*/ #include <linux/export.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 258033d62cb3..3732c888a949 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmision.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include <linux/export.h> diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 977918d5d350..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -42,9 +42,9 @@ int __read_mostly watchdog_user_enabled = 1; int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; -int __read_mostly nmi_watchdog_available; +static int __read_mostly nmi_watchdog_available; -struct cpumask watchdog_allowed_mask __read_mostly; +static struct cpumask watchdog_allowed_mask __read_mostly; struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -199,6 +199,13 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); +static int __init watchdog_thresh_setup(char *str) +{ + get_option(&str, &watchdog_thresh); + return 1; +} +__setup("watchdog_thresh=", watchdog_thresh_setup); + #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; @@ -547,13 +554,15 @@ static void softlockup_start_all(void) int lockup_detector_online_cpu(unsigned int cpu) { - watchdog_enable(cpu); + if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) + watchdog_enable(cpu); return 0; } int lockup_detector_offline_cpu(unsigned int cpu) { - watchdog_disable(cpu); + if (cpumask_test_cpu(cpu, &watchdog_allowed_mask)) + watchdog_disable(cpu); return 0; } @@ -581,7 +590,7 @@ static void lockup_detector_reconfigure(void) * Create the watchdog thread infrastructure and configure the detector(s). * * The threads are not unparked as watchdog_allowed_mask is empty. When - * the threads are sucessfully initialized, take the proper locks and + * the threads are successfully initialized, take the proper locks and * unpark the threads in the watchdog_cpumask if the watchdog is enabled. */ static __init void lockup_detector_setup(void) diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 71381168dede..247bf0b1582c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event, if (__this_cpu_read(hard_watchdog_warn) == true) return; - pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", + this_cpu); print_modules(); print_irqtrace_events(current); if (regs) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fc5d23d752a5..601d61150b65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * kernel/workqueue.c - generic async execution with shared worker pool * @@ -127,16 +128,16 @@ enum { * * PL: wq_pool_mutex protected. * - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. RCU protected for reads. * * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. 
* * PWR: wq_pool_mutex and wq->mutex protected for writes. Either or - * sched-RCU for reads. + * RCU for reads. * * WQ: wq->mutex protected. * - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * WR: wq->mutex protected for writes. RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -183,7 +184,7 @@ struct worker_pool { atomic_t nr_running ____cacheline_aligned_in_smp; /* - * Destruction of pool is sched-RCU protected to allow dereferences + * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; @@ -212,7 +213,7 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also sched-RCU protected so that the first pwq can be + * itself is also RCU protected so that the first pwq can be * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; @@ -259,13 +260,15 @@ struct workqueue_struct { struct wq_device *wq_dev; /* I: for sysfs interface */ #endif #ifdef CONFIG_LOCKDEP + char *lock_name; + struct lock_class_key key; struct lockdep_map lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ /* - * Destruction of workqueue_struct is sched-RCU protected to allow - * walking the workqueues list without grabbing wq_pool_mutex. + * Destruction of workqueue_struct is RCU protected to allow walking + * the workqueues list without grabbing wq_pool_mutex. * This is used to dump all workqueues from sysrq. */ struct rcu_head rcu; @@ -357,20 +360,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + "RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + "RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -382,7 +385,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_pool_mutex held or sched RCU read + * This must be called either with wq_pool_mutex held or RCU read * locked. If the pool needs to be used beyond the locking in effect, the * caller is responsible for guaranteeing that the pool stays online. * @@ -414,7 +417,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with wq->mutex held or sched RCU read locked. + * This must be called either with wq->mutex held or RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. 
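The workqueue.c comment and assertion changes above move pool/pwq lifetime rules from sched-RCU (implicit protection while preemption is disabled) to ordinary RCU read-side critical sections, matching the call_rcu() used to free these objects. A minimal sketch of the reader/updater pairing this relies on, using an illustrative structure rather than the real worker_pool:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_pool {
	spinlock_t	lock;
	struct rcu_head	rcu;
};

/* Reader: an explicit rcu_read_lock() section now keeps the object alive. */
static void reader(struct my_pool __rcu **slot)
{
	struct my_pool *pool;

	rcu_read_lock();
	pool = rcu_dereference(*slot);
	if (pool) {
		spin_lock_irq(&pool->lock);
		/* ... use the pool ... */
		spin_unlock_irq(&pool->lock);
	}
	rcu_read_unlock();
}

/* Updater: unpublish, then free after a grace period, as put_unbound_pool() does. */
static void free_pool_rcu(struct rcu_head *rcu)
{
	kfree(container_of(rcu, struct my_pool, rcu));
}

static void release(struct my_pool __rcu **slot, struct my_pool *pool)
{
	rcu_assign_pointer(*slot, NULL);
	call_rcu(&pool->rcu, free_pool_rcu);
}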
* @@ -550,7 +553,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) * @wq: the target workqueue * @node: the node ID * - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU + * This must be called with any of wq_pool_mutex, wq->mutex or RCU * read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. @@ -646,7 +649,7 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from * work->current_func, which is executed afterwards. This possible - * reordering can lead to a missed execution on attempt to qeueue + * reordering can lead to a missed execution on attempt to queue * the same @work. E.g. consider this case: * * CPU#0 CPU#1 @@ -694,8 +697,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Pools are created and destroyed under wq_pool_mutex, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under wq_pool_mutex or with preemption disabled. + * access under RCU read lock. As such, this function should be + * called under wq_pool_mutex or inside of a rcu_read_lock() region. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -839,43 +842,32 @@ static void wake_up_worker(struct worker_pool *pool) } /** - * wq_worker_waking_up - a worker is waking up + * wq_worker_running - a worker is running again * @task: task waking up - * @cpu: CPU @task is waking up to * - * This function is called during try_to_wake_up() when a worker is - * being awoken. - * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule() */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task) { struct worker *worker = kthread_data(task); - if (!(worker->flags & WORKER_NOT_RUNNING)) { - WARN_ON_ONCE(worker->pool->cpu != cpu); + if (!worker->sleeping) + return; + if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); - } + worker->sleeping = 0; } /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep * - * This function is called during schedule() when a busy worker is - * going to sleep. Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task) +void wq_worker_sleeping(struct task_struct *task) { - struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct worker *next, *worker = kthread_data(task); struct worker_pool *pool; /* @@ -884,13 +876,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * checking NOT_RUNNING. 
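The wq_init_lockdep()/wq_unregister_lockdep() helpers above give each workqueue its own dynamically registered lockdep key and a heap-allocated "(wq_completion)NAME" string, replacing the static key that callers previously passed to __alloc_workqueue_key(). A hedged sketch of the same dynamic-key pattern on an illustrative object (struct foo and its helpers are assumptions, and CONFIG_LOCKDEP is assumed enabled):

#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

struct foo {
	struct lock_class_key	key;
	struct lockdep_map	dep_map;
	char			*lock_name;
};

static void foo_init_lockdep(struct foo *f, const char *name)
{
	lockdep_register_key(&f->key);
	f->lock_name = kasprintf(GFP_KERNEL, "(wq_completion)%s", name);
	/* fall back to the caller's name if the allocation failed */
	lockdep_init_map(&f->dep_map, f->lock_name ? f->lock_name : name,
			 &f->key, 0);
}

static void foo_destroy_lockdep(struct foo *f)
{
	lockdep_unregister_key(&f->key);
	kfree(f->lock_name);
}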
*/ if (worker->flags & WORKER_NOT_RUNNING) - return NULL; + return; pool = worker->pool; - /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) - return NULL; + if (WARN_ON_ONCE(worker->sleeping)) + return; + + worker->sleeping = 1; + spin_lock_irq(&pool->lock); /* * The counterpart of the following dec_and_test, implied mb, @@ -904,13 +898,17 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * lock is safe. */ if (atomic_dec_and_test(&pool->nr_running) && - !list_empty(&pool->worklist)) - to_wakeup = first_idle_worker(pool); - return to_wakeup ? to_wakeup->task : NULL; + !list_empty(&pool->worklist)) { + next = first_idle_worker(pool); + if (next) + wake_up_process(next->task); + } + spin_unlock_irq(&pool->lock); } /** * wq_worker_last_func - retrieve worker's last work function + * @task: Task to retrieve last work function of. * * Determine the last function a worker executed. This is called from * the scheduler to get a worker's last known identity. @@ -918,6 +916,16 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) * CONTEXT: * spin_lock_irq(rq->lock) * + * This function is called during schedule() when a kworker is going + * to sleep. It's used by psi to identify aggregation workers during + * dequeuing, to allow periodic aggregation to shut-off when that + * worker is the last task in the system or cgroup to go to sleep. + * + * As this function doesn't involve any workqueue-related locking, it + * only returns stable values when called from inside the scheduler's + * queuing and dequeuing paths, when @task, which must be a kworker, + * is guaranteed to not be processing any works. + * * Return: * The last work function %current executed as a worker, NULL if it * hasn't executed any work yet. @@ -1120,7 +1128,7 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) { if (pwq) { /* - * As both pwqs and pools are sched-RCU protected, the + * As both pwqs and pools are RCU protected, the * following lock operations are safe. */ spin_lock_irq(&pwq->pool->lock); @@ -1248,6 +1256,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; + rcu_read_lock(); /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. @@ -1286,10 +1295,12 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, set_work_pool_and_keep_pending(work, pool->id); spin_unlock(&pool->lock); + rcu_read_unlock(); return 1; } spin_unlock(&pool->lock); fail: + rcu_read_unlock(); local_irq_restore(*flags); if (work_is_canceling(work)) return -ENOENT; @@ -1341,7 +1352,7 @@ static bool is_chained_work(struct workqueue_struct *wq) worker = current_wq_worker(); /* - * Return %true iff I'm a worker execuing a work item on @wq. If + * Return %true iff I'm a worker executing a work item on @wq. If * I'm @worker, it's safe to dereference it without locking. 
*/ return worker && worker->current_pwq->wq == wq; @@ -1403,6 +1414,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; + rcu_read_lock(); retry: if (req_cpu == WORK_CPU_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); @@ -1459,10 +1471,8 @@ retry: /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); - if (WARN_ON(!list_empty(&work->entry))) { - spin_unlock(&pwq->pool->lock); - return; - } + if (WARN_ON(!list_empty(&work->entry))) + goto out; pwq->nr_in_flight[pwq->work_color]++; work_flags = work_color_to_flags(pwq->work_color); @@ -1480,7 +1490,9 @@ retry: insert_work(pwq, work, worklist, work_flags); +out: spin_unlock(&pwq->pool->lock); + rcu_read_unlock(); } /** @@ -1512,6 +1524,90 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL(queue_work_on); +/** + * workqueue_select_cpu_near - Select a CPU based on NUMA node + * @node: NUMA node ID that we want to select a CPU from + * + * This function will attempt to find a "random" cpu available on a given + * node. If there are no CPUs available on the given node it will return + * WORK_CPU_UNBOUND indicating that we should just schedule to any + * available CPU if we need to schedule this work. + */ +static int workqueue_select_cpu_near(int node) +{ + int cpu; + + /* No point in doing this if NUMA isn't enabled for workqueues */ + if (!wq_numa_enabled) + return WORK_CPU_UNBOUND; + + /* Delay binding to CPU if node is not valid or online */ + if (node < 0 || node >= MAX_NUMNODES || !node_online(node)) + return WORK_CPU_UNBOUND; + + /* Use local node/cpu if we are already there */ + cpu = raw_smp_processor_id(); + if (node == cpu_to_node(cpu)) + return cpu; + + /* Use "random" otherwise know as "first" online CPU of node */ + cpu = cpumask_any_and(cpumask_of_node(node), cpu_online_mask); + + /* If CPU is valid return that, otherwise just defer */ + return cpu < nr_cpu_ids ? cpu : WORK_CPU_UNBOUND; +} + +/** + * queue_work_node - queue work on a "random" cpu for a given NUMA node + * @node: NUMA node that we are targeting the work for + * @wq: workqueue to use + * @work: work to queue + * + * We queue the work to a "random" CPU within a given NUMA node. The basic + * idea here is to provide a way to somehow associate work with a given + * NUMA node. + * + * This function will only make a best effort attempt at getting this onto + * the right NUMA node. If no node is requested or the requested node is + * offline then we just fall back to standard queue_work behavior. + * + * Currently the "random" CPU ends up being the first available CPU in the + * intersection of cpu_online_mask and the cpumask of the node, unless we + * are running on the node. In that case we just use the current CPU. + * + * Return: %false if @work was already on a queue, %true otherwise. + */ +bool queue_work_node(int node, struct workqueue_struct *wq, + struct work_struct *work) +{ + unsigned long flags; + bool ret = false; + + /* + * This current implementation is specific to unbound workqueues. + * Specifically we only return the first available CPU for a given + * node instead of cycling through individual CPUs within the node. + * + * If this is used with a per-cpu workqueue then the logic in + * workqueue_select_cpu_near would need to be updated to allow for + * some round robin type logic. 
+ */ + WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)); + + local_irq_save(flags); + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + int cpu = workqueue_select_cpu_near(node); + + __queue_work(cpu, wq, work); + ret = true; + } + + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_node); + void delayed_work_timer_fn(struct timer_list *t) { struct delayed_work *dwork = from_timer(dwork, t, timer); @@ -1639,7 +1735,7 @@ static void rcu_work_rcufn(struct rcu_head *rcu) * * Return: %false if @rwork was already pending, %true otherwise. Note * that a full RCU grace period is guaranteed only after a %true return. - * While @rwork is guarnateed to be executed after a %false return, the + * While @rwork is guaranteed to be executed after a %false return, the * execution may happen before a full RCU grace period has passed. */ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) @@ -2181,7 +2277,7 @@ __acquires(&pool->lock) if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" - " last function: %pf\n", + " last function: %ps\n", current->comm, preempt_count(), task_pid_nr(current), worker->current_func); debug_show_held_locks(current); @@ -2500,11 +2596,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, worker = current_wq_worker(); WARN_ONCE(current->flags & PF_MEMALLOC, - "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", current->pid, current->comm, target_wq->name, target_func); WARN_ONCE(worker && ((worker->current_pwq->wq->flags & (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), - "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); } @@ -2878,14 +2974,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, might_sleep(); - local_irq_disable(); + rcu_read_lock(); pool = get_work_pool(work); if (!pool) { - local_irq_enable(); + rcu_read_unlock(); return false; } - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -2917,10 +3013,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, lock_map_acquire(&pwq->wq->lockdep_map); lock_map_release(&pwq->wq->lockdep_map); } - + rcu_read_unlock(); return true; already_gone: spin_unlock_irq(&pool->lock); + rcu_read_unlock(); return false; } @@ -2931,6 +3028,9 @@ static bool __flush_work(struct work_struct *work, bool from_cancel) if (WARN_ON(!wq_online)) return false; + if (WARN_ON(!work->func)) + return false; + if (!from_cancel) { lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); @@ -3229,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); * * Undo alloc_workqueue_attrs(). */ -void free_workqueue_attrs(struct workqueue_attrs *attrs) +static void free_workqueue_attrs(struct workqueue_attrs *attrs) { if (attrs) { free_cpumask_var(attrs->cpumask); @@ -3239,21 +3339,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) /** * alloc_workqueue_attrs - allocate a workqueue_attrs - * @gfp_mask: allocation mask to use * * Allocate a new workqueue_attrs, initialize with default settings and * return it. 
* * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static struct workqueue_attrs *alloc_workqueue_attrs(void) { struct workqueue_attrs *attrs; - attrs = kzalloc(sizeof(*attrs), gfp_mask); + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); if (!attrs) goto fail; - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); @@ -3331,17 +3430,57 @@ static int init_worker_pool(struct worker_pool *pool) pool->refcnt = 1; /* shouldn't fail above this point */ - pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + pool->attrs = alloc_workqueue_attrs(); if (!pool->attrs) return -ENOMEM; return 0; } +#ifdef CONFIG_LOCKDEP +static void wq_init_lockdep(struct workqueue_struct *wq) +{ + char *lock_name; + + lockdep_register_key(&wq->key); + lock_name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name); + if (!lock_name) + lock_name = wq->name; + + wq->lock_name = lock_name; + lockdep_init_map(&wq->lockdep_map, lock_name, &wq->key, 0); +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ + lockdep_unregister_key(&wq->key); +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ + if (wq->lock_name != wq->name) + kfree(wq->lock_name); +} +#else +static void wq_init_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_unregister_lockdep(struct workqueue_struct *wq) +{ +} + +static void wq_free_lockdep(struct workqueue_struct *wq) +{ +} +#endif + static void rcu_free_wq(struct rcu_head *rcu) { struct workqueue_struct *wq = container_of(rcu, struct workqueue_struct, rcu); + wq_free_lockdep(wq); + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_pwqs); else @@ -3364,7 +3503,7 @@ static void rcu_free_pool(struct rcu_head *rcu) * put_unbound_pool - put a worker_pool * @pool: worker_pool to put * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). @@ -3418,7 +3557,7 @@ static void put_unbound_pool(struct worker_pool *pool) del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); - /* sched-RCU protected to allow dereferences from get_work_pool() */ + /* RCU protected to allow dereferences from get_work_pool() */ call_rcu(&pool->rcu, rcu_free_pool); } @@ -3532,8 +3671,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Schedule RCU free. */ - if (is_last) + if (is_last) { + wq_unregister_lockdep(wq); call_rcu(&wq->rcu, rcu_free_wq); + } } /** @@ -3754,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); - new_attrs = alloc_workqueue_attrs(GFP_KERNEL); - tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + new_attrs = alloc_workqueue_attrs(); + tmp_attrs = alloc_workqueue_attrs(); if (!ctx || !new_attrs || !tmp_attrs) goto out_free; @@ -3891,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, * * Return: 0 on success and -errno on failure. 
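queue_work_node() above lets a caller hint which NUMA node an unbound work item should run near, falling back to normal placement when the node is invalid or offline. A hedged usage sketch; the workqueue, work item, and function names are illustrative:

#include <linux/printk.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_unbound_wq;	/* created with WQ_UNBOUND */
static struct work_struct my_work;

static void queue_near_device(int dev_node)
{
	/*
	 * Prefer a CPU on the device's NUMA node; if dev_node is
	 * NUMA_NO_NODE or offline, this behaves like plain queue_work().
	 */
	if (!queue_work_node(dev_node, my_unbound_wq, &my_work))
		pr_debug("work was already pending\n");
}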
*/ -int apply_workqueue_attrs(struct workqueue_struct *wq, +static int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { int ret; @@ -3902,7 +4043,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } -EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug @@ -4067,11 +4207,10 @@ static int init_rescuer(struct workqueue_struct *wq) return 0; } -struct workqueue_struct *__alloc_workqueue_key(const char *fmt, - unsigned int flags, - int max_active, - struct lock_class_key *key, - const char *lock_name, ...) +__printf(1, 4) +struct workqueue_struct *alloc_workqueue(const char *fmt, + unsigned int flags, + int max_active, ...) { size_t tbl_size = 0; va_list args; @@ -4101,12 +4240,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return NULL; if (flags & WQ_UNBOUND) { - wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + wq->unbound_attrs = alloc_workqueue_attrs(); if (!wq->unbound_attrs) goto err_free_wq; } - va_start(args, lock_name); + va_start(args, max_active); vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); @@ -4123,11 +4262,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->flusher_overflow); INIT_LIST_HEAD(&wq->maydays); - lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); + wq_init_lockdep(wq); INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) - goto err_free_wq; + goto err_unreg_lockdep; if (wq_online && init_rescuer(wq) < 0) goto err_destroy; @@ -4153,6 +4292,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; +err_unreg_lockdep: + wq_unregister_lockdep(wq); + wq_free_lockdep(wq); err_free_wq: free_workqueue_attrs(wq->unbound_attrs); kfree(wq); @@ -4161,7 +4303,7 @@ err_destroy: destroy_workqueue(wq); return NULL; } -EXPORT_SYMBOL_GPL(__alloc_workqueue_key); +EXPORT_SYMBOL_GPL(alloc_workqueue); /** * destroy_workqueue - safely terminate a workqueue @@ -4214,6 +4356,7 @@ void destroy_workqueue(struct workqueue_struct *wq) kthread_stop(wq->rescuer->task); if (!(wq->flags & WQ_UNBOUND)) { + wq_unregister_lockdep(wq); /* * The base ref is never dropped on per-cpu pwqs. Directly * schedule RCU free. 
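With the hunk above, alloc_workqueue() itself becomes the exported, printf-style entry point and no longer takes a lock_class_key/lock_name pair; the lockdep key is now registered internally by wq_init_lockdep(). A hedged sketch of a caller after the change, with illustrative names and flags:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static int my_driver_init(int instance)
{
	/* format arguments follow max_active; no lockdep key is passed */
	my_wq = alloc_workqueue("mydrv_wq_%d", WQ_UNBOUND | WQ_MEM_RECLAIM,
				0, instance);
	if (!my_wq)
		return -ENOMEM;
	return 0;
}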
@@ -4328,7 +4471,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - rcu_read_lock_sched(); + rcu_read_lock(); + preempt_disable(); if (cpu == WORK_CPU_UNBOUND) cpu = smp_processor_id(); @@ -4339,7 +4483,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); - rcu_read_unlock_sched(); + preempt_enable(); + rcu_read_unlock(); return ret; } @@ -4365,15 +4510,15 @@ unsigned int work_busy(struct work_struct *work) if (work_pending(work)) ret |= WORK_BUSY_PENDING; - local_irq_save(flags); + rcu_read_lock(); pool = get_work_pool(work); if (pool) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - local_irq_restore(flags); + rcu_read_unlock(); return ret; } @@ -4444,7 +4589,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) probe_kernel_read(desc, worker->desc, sizeof(desc) - 1); if (fn || name[0] || desc[0]) { - printk("%sWorkqueue: %s %pf", log_lvl, name, fn); + printk("%sWorkqueue: %s %ps", log_lvl, name, fn); if (strcmp(name, desc)) pr_cont(" (%s)", desc); pr_cont("\n"); @@ -4469,7 +4614,7 @@ static void pr_cont_work(bool comma, struct work_struct *work) pr_cont("%s BAR(%d)", comma ? "," : "", task_pid_nr(barr->task)); } else { - pr_cont("%s %pf", comma ? "," : "", work->func); + pr_cont("%s %ps", comma ? "," : "", work->func); } } @@ -4501,7 +4646,7 @@ static void show_pwq(struct pool_workqueue *pwq) if (worker->current_pwq != pwq) continue; - pr_cont("%s %d%s:%pf", comma ? "," : "", + pr_cont("%s %d%s:%ps", comma ? "," : "", task_pid_nr(worker->task), worker == pwq->wq->rescuer ? "(RESCUER)" : "", worker->current_func); @@ -4557,7 +4702,7 @@ void show_workqueue_state(void) unsigned long flags; int pi; - rcu_read_lock_sched(); + rcu_read_lock(); pr_info("Showing busy workqueues and worker pools:\n"); @@ -4622,7 +4767,7 @@ void show_workqueue_state(void) touch_nmi_watchdog(); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } /* used to show worker information through /proc/PID/{comm,stat,status} */ @@ -4786,7 +4931,7 @@ static void rebind_workers(struct worker_pool *pool) * * WRITE_ONCE() is necessary because @worker->flags may be * tested without holding any lock in - * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * wq_worker_running(). Without it, NOT_RUNNING test may * fail incorrectly leading to premature concurrency * management operations. */ @@ -5009,16 +5154,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. 
*/ - rcu_read_lock_sched(); + rcu_read_lock(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - rcu_read_unlock_sched(); + rcu_read_unlock(); goto out_unlock; } } - rcu_read_unlock_sched(); + rcu_read_unlock(); } out_unlock: mutex_unlock(&wq_pool_mutex); @@ -5213,7 +5358,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, const char *delim = ""; int node, written = 0; - rcu_read_lock_sched(); + get_online_cpus(); + rcu_read_lock(); for_each_node(node) { written += scnprintf(buf + written, PAGE_SIZE - written, "%s%d:%d", delim, node, @@ -5221,7 +5367,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, delim = " "; } written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); + rcu_read_unlock(); + put_online_cpus(); return written; } @@ -5246,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) lockdep_assert_held(&wq_pool_mutex); - attrs = alloc_workqueue_attrs(GFP_KERNEL); + attrs = alloc_workqueue_attrs(); if (!attrs) return NULL; @@ -5668,7 +5815,7 @@ static void __init wq_numa_init(void) return; } - wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); BUG_ON(!wq_update_unbound_numa_attrs_buf); /* @@ -5743,7 +5890,7 @@ int __init workqueue_init_early(void) for (i = 0; i < NR_STD_WORKER_POOLS; i++) { struct workqueue_attrs *attrs; - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; unbound_std_wq_attrs[i] = attrs; @@ -5752,7 +5899,7 @@ int __init workqueue_init_early(void) * guaranteed by max_active which is enforced by pwqs. * Turn off NUMA so that dfl_pwq is used for all nodes. */ - BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + BUG_ON(!(attrs = alloc_workqueue_attrs())); attrs->nice = std_nice[i]; attrs->no_numa = true; ordered_wq_attrs[i] = attrs; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -44,6 +44,7 @@ struct worker { unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ + int sleeping; /* None */ /* * Opaque string set with work_set_desc(). Printed out with task @@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched/ and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |
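The workqueue_internal.h hunk above adds a worker->sleeping flag and renames the scheduler hooks: the scheduler now only notifies the workqueue code, and wq_worker_sleeping() wakes the next idle worker itself under pool->lock instead of returning a task for the scheduler to wake while holding the runqueue lock. A hedged sketch of how the scheduler side is expected to pair these hooks; the call sites live in kernel/sched/core.c, not in this diff, so the surrounding helper names are assumptions:

#include <linux/sched.h>
/* wq_worker_sleeping()/wq_worker_running() are declared in kernel/workqueue_internal.h */

static inline void sched_submit_work(struct task_struct *tsk)
{
	/* on the way into schedule() for a task that is about to block */
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_sleeping(tsk);	/* may wake another idle worker */
}

static inline void sched_update_worker(struct task_struct *tsk)
{
	/* after returning from schedule() */
	if (tsk->flags & PF_WQ_WORKER)
		wq_worker_running(tsk);		/* restores nr_running accounting */
}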