diff options
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.h | 1 | ||||
-rw-r--r-- | tools/virtiofsd/helper.c | 47 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_ll.c | 102 |
3 files changed, 133 insertions, 17 deletions
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index 8f6d705b5c..562fd5241e 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts { int syslog; int log_level; unsigned int max_idle_threads; + unsigned long rlimit_nofile; }; /** diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index 819c2bc13c..00a1ef666a 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -23,6 +23,8 @@ #include <stdlib.h> #include <string.h> #include <sys/param.h> +#include <sys/time.h> +#include <sys/resource.h> #include <unistd.h> #define FUSE_HELPER_OPT(t, p) \ @@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = { FUSE_HELPER_OPT("subtype=", nodefault_subtype), FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile), FUSE_HELPER_OPT("--syslog", syslog), FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), @@ -171,6 +174,10 @@ void fuse_cmdline_help(void) " default: no_writeback\n" " -o xattr|no_xattr enable/disable xattr\n" " default: no_xattr\n" + " --rlimit-nofile=<num> set maximum number of file descriptors\n" + " (0 leaves rlimit unchanged)\n" + " default: min(1000000, fs.file-max - 16384)\n" + " if the current rlimit is lower\n" ); } @@ -191,11 +198,51 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key, } } +static unsigned long get_default_rlimit_nofile(void) +{ + g_autofree gchar *file_max_str = NULL; + const rlim_t reserved_fds = 16384; /* leave at least this many fds free */ + rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ + rlim_t file_max; + struct rlimit rlim; + + /* + * Reduce max_fds below the system-wide maximum, if necessary. This + * ensures there are fds available for other processes so we don't + * cause resource exhaustion. + */ + if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str, + NULL, NULL)) { + fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n"); + exit(1); + } + file_max = g_ascii_strtoull(file_max_str, NULL, 10); + if (file_max < 2 * reserved_fds) { + fuse_log(FUSE_LOG_ERR, + "The fs.file-max sysctl is too low (%lu) to allow a " + "reasonable number of open files.\n", + (unsigned long)file_max); + exit(1); + } + max_fds = MIN(file_max - reserved_fds, max_fds); + + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { + fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); + } + + if (rlim.rlim_cur >= max_fds) { + return 0; /* we have more fds available than required! */ + } + return max_fds; +} + int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) { memset(opts, 0, sizeof(struct fuse_cmdline_opts)); opts->max_idle_threads = 10; + opts->rlimit_nofile = get_default_rlimit_nofile(); opts->foreground = 1; if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index 4c35c95b25..3ba1d90984 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -2531,11 +2531,24 @@ static void print_capabilities(void) } /* + * Drop all Linux capabilities because the wait parent process only needs to + * sit in waitpid(2) and terminate. + */ +static void setup_wait_parent_capabilities(void) +{ + capng_setpid(syscall(SYS_gettid)); + capng_clear(CAPNG_SELECT_BOTH); + capng_apply(CAPNG_SELECT_BOTH); +} + +/* * Move to a new mount, net, and pid namespaces to isolate this process. */ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) { pid_t child; + char template[] = "virtiofsd-XXXXXX"; + char *tmpdir; /* * Create a new pid namespace for *child* processes. We'll have to @@ -2561,6 +2574,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) pid_t waited; int wstatus; + setup_wait_parent_capabilities(); + /* The parent waits for the child */ do { waited = waitpid(child, &wstatus, 0); @@ -2597,12 +2612,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) exit(1); } + tmpdir = mkdtemp(template); + if (!tmpdir) { + fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template); + exit(1); + } + + if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n", + tmpdir); + exit(1); + } + /* Now we can get our /proc/self/fd directory file descriptor */ - lo->proc_self_fd = open("/proc/self/fd", O_PATH); + lo->proc_self_fd = open(tmpdir, O_PATH); if (lo->proc_self_fd == -1) { - fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n"); + fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir); exit(1); } + + if (umount2(tmpdir, MNT_DETACH) < 0) { + fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir); + exit(1); + } + + if (rmdir(tmpdir) < 0) { + fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir); + } } /* @@ -2643,7 +2679,7 @@ static void setup_mounts(const char *source) int oldroot; int newroot; - if (mount(source, source, NULL, MS_BIND, NULL) < 0) { + if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); exit(1); } @@ -2696,6 +2732,43 @@ static void setup_mounts(const char *source) } /* + * Only keep whitelisted capabilities that are needed for file system operation + */ +static void setup_capabilities(void) +{ + pthread_mutex_lock(&cap.mutex); + capng_restore_state(&cap.saved); + + /* + * Whitelist file system-related capabilities that are needed for a file + * server to act like root. Drop everything else like networking and + * sysadmin capabilities. + * + * Exclusions: + * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl + * and we don't support that. + * 2. CAP_MAC_OVERRIDE is not included because it only seems to be + * used by the Smack LSM. Omit it until there is demand for it. + */ + capng_setpid(syscall(SYS_gettid)); + capng_clear(CAPNG_SELECT_BOTH); + capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, + CAP_CHOWN, + CAP_DAC_OVERRIDE, + CAP_DAC_READ_SEARCH, + CAP_FOWNER, + CAP_FSETID, + CAP_SETGID, + CAP_SETUID, + CAP_MKNOD, + CAP_SETFCAP); + capng_apply(CAPNG_SELECT_BOTH); + + cap.saved = capng_save_state(); + pthread_mutex_unlock(&cap.mutex); +} + +/* * Lock down this process to prevent access to other processes or files outside * source directory. This reduces the impact of arbitrary code execution bugs. */ @@ -2705,26 +2778,21 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, setup_namespaces(lo, se); setup_mounts(lo->source); setup_seccomp(enable_syslog); + setup_capabilities(); } -/* Raise the maximum number of open file descriptors */ -static void setup_nofile_rlimit(void) +/* Set the maximum number of open file descriptors */ +static void setup_nofile_rlimit(unsigned long rlimit_nofile) { - const rlim_t max_fds = 1000000; - struct rlimit rlim; - - if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { - fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); - exit(1); - } + struct rlimit rlim = { + .rlim_cur = rlimit_nofile, + .rlim_max = rlimit_nofile, + }; - if (rlim.rlim_cur >= max_fds) { + if (rlimit_nofile == 0) { return; /* nothing to do */ } - rlim.rlim_cur = max_fds; - rlim.rlim_max = max_fds; - if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { /* Ignore SELinux denials */ if (errno == EPERM) { @@ -2977,7 +3045,7 @@ int main(int argc, char *argv[]) fuse_daemonize(opts.foreground); - setup_nofile_rlimit(); + setup_nofile_rlimit(opts.rlimit_nofile); /* Must be before sandbox since it wants /proc */ setup_capng(); |