Diffstat (limited to 'kernel/fork.c')
-rw-r--r--	kernel/fork.c	306
1 file changed, 223 insertions(+), 83 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..d8ae0f1b4148 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/fork.c
*
@@ -122,7 +123,7 @@
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
-int max_threads; /* tunable limit on nr_threads */
+static int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
@@ -247,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
- return page ? page_address(page) : NULL;
+ if (likely(page)) {
+ tsk->stack = page_address(page);
+ return tsk->stack;
+ }
+ return NULL;
#endif
}
@@ -672,7 +677,6 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
- hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
@@ -893,6 +897,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
@@ -1704,38 +1710,39 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
}
#endif
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct task_struct *task;
+ struct pid *pid = file->private_data;
+ int poll_flags = 0;
+
+ poll_wait(file, &pid->wait_pidfd, pts);
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ /*
+ * Inform pollers only when the whole thread group exits.
+ * If the thread group leader exits before all other threads in the
+ * group, then poll(2) should block, similar to the wait(2) family.
+ */
+ if (!task || (task->exit_state && thread_group_empty(task)))
+ poll_flags = POLLIN | POLLRDNORM;
+ rcu_read_unlock();
+
+ return poll_flags;
+}
+
const struct file_operations pidfd_fops = {
.release = pidfd_release,
+ .poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
#endif
};
-/**
- * pidfd_create() - Create a new pid file descriptor.
- *
- * @pid: struct pid that the pidfd will reference
- *
- * This creates a new pid file descriptor with the O_CLOEXEC flag set.
- *
- * Note, that this function can only be called after the fd table has
- * been unshared to avoid leaking the pidfd to the new process.
- *
- * Return: On success, a cloexec pidfd is returned.
- * On error, a negative errno number will be returned.
- */
-static int pidfd_create(struct pid *pid)
-{
- int fd;
-
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
-
- return fd;
-}
-
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
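
A minimal userspace sketch of the notification path this hunk enables (illustration only, not part of the commit; the raw-syscall argument order shown is the x86_64 convention, and CLONE_PIDFD is defined locally in case the libc headers predate this series): clone(CLONE_PIDFD) hands back a pidfd through the parent_tid pointer, and polling it blocks until pidfd_poll() reports POLLIN for the exited thread group.

#define _GNU_SOURCE
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000	/* value introduced by this series */
#endif

int main(void)
{
	int pidfd = -1;

	/* x86_64 raw clone: flags, stack, parent_tid, child_tid, tls.
	 * CLONE_PIDFD returns the new pidfd via the parent_tid slot. */
	pid_t pid = syscall(SYS_clone, CLONE_PIDFD | SIGCHLD, 0UL,
			    &pidfd, NULL, 0UL);
	if (pid < 0)
		return 1;
	if (pid == 0)
		_exit(0);		/* child exits immediately */

	struct pollfd fds = { .fd = pidfd, .events = POLLIN };

	/* Blocks until pidfd_poll() flags POLLIN, i.e. the whole
	 * thread group of the child has exited. */
	if (poll(&fds, 1, -1) == 1 && (fds.revents & POLLIN))
		printf("child %d exited\n", (int)pid);
	return 0;
}
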
@@ -1760,19 +1767,16 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
- unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
struct pid *pid,
int trace,
- unsigned long tls,
- int node)
+ int node,
+ struct kernel_clone_args *args)
{
int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
+ struct file *pidfile = NULL;
+ u64 clone_flags = args->flags;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1821,27 +1825,12 @@ static __latent_entropy struct task_struct *copy_process(
}
if (clone_flags & CLONE_PIDFD) {
- int reserved;
-
/*
- * - CLONE_PARENT_SETTID is useless for pidfds and also
- * parent_tidptr is used to return pidfds.
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
* - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags &
- (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
- return ERR_PTR(-EINVAL);
-
- /*
- * Verify that parent_tidptr is sane so we can potentially
- * reuse it later.
- */
- if (get_user(reserved, parent_tidptr))
- return ERR_PTR(-EFAULT);
-
- if (reserved != 0)
+ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
return ERR_PTR(-EINVAL);
}
@@ -1874,11 +1863,11 @@ static __latent_entropy struct task_struct *copy_process(
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
ftrace_graph_init_task(p);
@@ -1983,9 +1972,6 @@ static __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
- p->lockdep_depth = 0; /* no locks held yet */
- p->curr_chain_key = 0;
- p->lockdep_recursion = 0;
lockdep_init_task(p);
#endif
@@ -2037,7 +2023,8 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+ retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+ args->tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -2057,12 +2044,22 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = pidfd_create(pid);
+ retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
- retval = put_user(pidfd, parent_tidptr);
+
+ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfile)) {
+ put_unused_fd(pidfd);
+ retval = PTR_ERR(pidfile);
+ goto bad_fork_free_pid;
+ }
+ get_pid(pid); /* held by pidfile now */
+
+ retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
}
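
The hunk above follows a common kernel idiom: reserve the descriptor number, create the struct file, and only publish the fd once nothing can fail anymore, so a half-constructed pidfd is never visible to userspace. A generic sketch of the pattern, with placeholder names (example_fops, example_install_anon_fd) that are not from this patch:

#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

static const struct file_operations example_fops;	/* placeholder */

static int example_install_anon_fd(void *data)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);	/* reserve the number */
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[example]", &example_fops, data,
				  O_RDWR | O_CLOEXEC);
	if (IS_ERR(file)) {
		put_unused_fd(fd);	/* nothing was published yet */
		return PTR_ERR(file);
	}

	/* Everything that can still fail must happen before this call. */
	fd_install(fd, file);		/* fd is now visible to userspace */
	return fd;
}
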
@@ -2105,7 +2102,7 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
- p->exit_signal = (clone_flags & CSIGNAL);
+ p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}
@@ -2138,7 +2135,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boot_ns();
+ p->real_start_time = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
@@ -2179,6 +2176,9 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
+ /* past the last point of failure */
+ if (pidfile)
+ fd_install(pidfd, pidfile);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2245,8 +2245,10 @@ bad_fork_cancel_cgroup:
bad_fork_cgroup_threadgroup_change_end:
cgroup_threadgroup_change_end(current);
bad_fork_put_pidfd:
- if (clone_flags & CLONE_PIDFD)
- ksys_close(pidfd);
+ if (clone_flags & CLONE_PIDFD) {
+ fput(pidfile);
+ put_unused_fd(pidfd);
+ }
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
@@ -2313,8 +2315,11 @@ static inline void init_idle_pids(struct task_struct *idle)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
- cpu_to_node(cpu));
+ struct kernel_clone_args args = {
+ .flags = CLONE_VM,
+ };
+
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
if (!IS_ERR(task)) {
init_idle_pids(task);
init_idle(task, cpu);
@@ -2334,13 +2339,9 @@ struct mm_struct *copy_init_mm(void)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long _do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
- unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
{
+ u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
@@ -2356,7 +2357,7 @@ long _do_fork(unsigned long clone_flags,
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
@@ -2365,8 +2366,7 @@ long _do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
- child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
if (IS_ERR(p))
@@ -2382,7 +2382,7 @@ long _do_fork(unsigned long clone_flags,
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
+ put_user(nr, args->parent_tid);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
@@ -2405,6 +2405,16 @@ long _do_fork(unsigned long clone_flags,
return nr;
}
+bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
+{
+ /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+ if ((kargs->flags & CLONE_PIDFD) &&
+ (kargs->flags & CLONE_PARENT_SETTID))
+ return false;
+
+ return true;
+}
+
#ifndef CONFIG_HAVE_COPY_THREAD_TLS
/* For compatibility with architectures that call do_fork directly rather than
* using the syscall entry points below. */
@@ -2414,8 +2424,20 @@ long do_fork(unsigned long clone_flags,
int __user *parent_tidptr,
int __user *child_tidptr)
{
- return _do_fork(clone_flags, stack_start, stack_size,
- parent_tidptr, child_tidptr, 0);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = stack_start,
+ .stack_size = stack_size,
+ };
+
+ if (!legacy_clone_args_valid(&args))
+ return -EINVAL;
+
+ return _do_fork(&args);
}
#endif
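
With kernel_clone_args, the legacy encoding that packed the exit signal into the low byte of the clone flags is unpacked once at this boundary. A worked example of the split (illustration only):

/*
 * clone_flags = CLONE_VFORK | CLONE_VM | SIGCHLD   (legacy encoding)
 *
 *   args.flags       = clone_flags & ~CSIGNAL;  -> CLONE_VFORK | CLONE_VM
 *   args.exit_signal = clone_flags & CSIGNAL;   -> SIGCHLD (0x11)
 *
 * CSIGNAL is 0x000000ff, so the signal number and the clone flag bits
 * can no longer collide inside struct kernel_clone_args.
 */
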
@@ -2424,15 +2446,25 @@ long do_fork(unsigned long clone_flags,
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ };
+
+ return _do_fork(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -2443,8 +2475,12 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = CLONE_VFORK | CLONE_VM,
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2472,7 +2508,111 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
unsigned long, tls)
#endif
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls,
+ };
+
+ if (!legacy_clone_args_valid(&args))
+ return -EINVAL;
+
+ return _do_fork(&args);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE3
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+ struct clone_args __user *uargs,
+ size_t size)
+{
+ struct clone_args args;
+
+ if (unlikely(size > PAGE_SIZE))
+ return -E2BIG;
+
+ if (unlikely(size < sizeof(struct clone_args)))
+ return -EINVAL;
+
+ if (unlikely(!access_ok(uargs, size)))
+ return -EFAULT;
+
+ if (size > sizeof(struct clone_args)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uargs + sizeof(struct clone_args);
+ end = (void __user *)uargs + size;
+
+ for (; addr < end; addr++) {
+ if (get_user(val, addr))
+ return -EFAULT;
+ if (val)
+ return -E2BIG;
+ }
+
+ size = sizeof(struct clone_args);
+ }
+
+ if (copy_from_user(&args, uargs, size))
+ return -EFAULT;
+
+ *kargs = (struct kernel_clone_args){
+ .flags = args.flags,
+ .pidfd = u64_to_user_ptr(args.pidfd),
+ .child_tid = u64_to_user_ptr(args.child_tid),
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
+ .exit_signal = args.exit_signal,
+ .stack = args.stack,
+ .stack_size = args.stack_size,
+ .tls = args.tls,
+ };
+
+ return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+ /*
+ * All lower bits of the flag word are taken.
+ * Verify that no other unknown flags are passed along.
+ */
+ if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ return false;
+
+ /*
+ * - make the CLONE_DETACHED bit reuseable for clone3
+ * - make the CSIGNAL bits reuseable for clone3
+ */
+ if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ return false;
+
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+ kargs->exit_signal)
+ return false;
+
+ return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+ int err;
+
+ struct kernel_clone_args kargs;
+
+ err = copy_clone_args_from_user(&kargs, uargs, size);
+ if (err)
+ return err;
+
+ if (!clone3_args_valid(&kargs))
+ return -EINVAL;
+
+ return _do_fork(&kargs);
}
#endif
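
To round out the picture, a userspace sketch of calling the new syscall (illustration only, not part of the commit; assumes uapi headers that provide struct clone_args and a SYS_clone3 number, 435 on the architectures that wire it up):

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_PIDFD */
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int pidfd = -1;
	struct clone_args args;

	/* Unused fields must be zero: copy_clone_args_from_user() rejects
	 * nonzero trailing bytes with E2BIG. */
	memset(&args, 0, sizeof(args));
	args.flags = CLONE_PIDFD;
	args.pidfd = (uint64_t)(uintptr_t)&pidfd;
	args.exit_signal = SIGCHLD;	/* no longer encoded in the flags */

	/* The size argument is what makes the struct extensible. */
	pid_t pid = syscall(SYS_clone3, &args, sizeof(args));
	if (pid < 0) {
		perror("clone3");
		return 1;
	}
	if (pid == 0)
		_exit(0);		/* child */
	printf("child %d, pidfd %d\n", (int)pid, pidfd);
	return 0;
}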