summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tools/virtiofsd/fuse_lowlevel.h1
-rw-r--r--tools/virtiofsd/helper.c47
-rw-r--r--tools/virtiofsd/passthrough_ll.c102
3 files changed, 133 insertions, 17 deletions
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 8f6d705b5c..562fd5241e 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts {
int syslog;
int log_level;
unsigned int max_idle_threads;
+ unsigned long rlimit_nofile;
};
/**
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index 819c2bc13c..00a1ef666a 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -23,6 +23,8 @@
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/resource.h>
#include <unistd.h>
#define FUSE_HELPER_OPT(t, p) \
@@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = {
FUSE_HELPER_OPT("subtype=", nodefault_subtype),
FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP),
FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads),
+ FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile),
FUSE_HELPER_OPT("--syslog", syslog),
FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG),
FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO),
@@ -171,6 +174,10 @@ void fuse_cmdline_help(void)
" default: no_writeback\n"
" -o xattr|no_xattr enable/disable xattr\n"
" default: no_xattr\n"
+ " --rlimit-nofile=<num> set maximum number of file descriptors\n"
+ " (0 leaves rlimit unchanged)\n"
+ " default: min(1000000, fs.file-max - 16384)\n"
+ " if the current rlimit is lower\n"
);
}
@@ -191,11 +198,51 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key,
}
}
+static unsigned long get_default_rlimit_nofile(void)
+{
+ g_autofree gchar *file_max_str = NULL;
+ const rlim_t reserved_fds = 16384; /* leave at least this many fds free */
+ rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */
+ rlim_t file_max;
+ struct rlimit rlim;
+
+ /*
+ * Reduce max_fds below the system-wide maximum, if necessary. This
+ * ensures there are fds available for other processes so we don't
+ * cause resource exhaustion.
+ */
+ if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str,
+ NULL, NULL)) {
+ fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n");
+ exit(1);
+ }
+ file_max = g_ascii_strtoull(file_max_str, NULL, 10);
+ if (file_max < 2 * reserved_fds) {
+ fuse_log(FUSE_LOG_ERR,
+ "The fs.file-max sysctl is too low (%lu) to allow a "
+ "reasonable number of open files.\n",
+ (unsigned long)file_max);
+ exit(1);
+ }
+ max_fds = MIN(file_max - reserved_fds, max_fds);
+
+ if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
+ fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
+ exit(1);
+ }
+
+ if (rlim.rlim_cur >= max_fds) {
+ return 0; /* we have more fds available than required! */
+ }
+ return max_fds;
+}
+
int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts)
{
memset(opts, 0, sizeof(struct fuse_cmdline_opts));
opts->max_idle_threads = 10;
+ opts->rlimit_nofile = get_default_rlimit_nofile();
opts->foreground = 1;
if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) ==
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 4c35c95b25..3ba1d90984 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2531,11 +2531,24 @@ static void print_capabilities(void)
}
/*
+ * Drop all Linux capabilities because the wait parent process only needs to
+ * sit in waitpid(2) and terminate.
+ */
+static void setup_wait_parent_capabilities(void)
+{
+ capng_setpid(syscall(SYS_gettid));
+ capng_clear(CAPNG_SELECT_BOTH);
+ capng_apply(CAPNG_SELECT_BOTH);
+}
+
+/*
* Move to a new mount, net, and pid namespaces to isolate this process.
*/
static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
{
pid_t child;
+ char template[] = "virtiofsd-XXXXXX";
+ char *tmpdir;
/*
* Create a new pid namespace for *child* processes. We'll have to
@@ -2561,6 +2574,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
pid_t waited;
int wstatus;
+ setup_wait_parent_capabilities();
+
/* The parent waits for the child */
do {
waited = waitpid(child, &wstatus, 0);
@@ -2597,12 +2612,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
exit(1);
}
+ tmpdir = mkdtemp(template);
+ if (!tmpdir) {
+ fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template);
+ exit(1);
+ }
+
+ if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n",
+ tmpdir);
+ exit(1);
+ }
+
/* Now we can get our /proc/self/fd directory file descriptor */
- lo->proc_self_fd = open("/proc/self/fd", O_PATH);
+ lo->proc_self_fd = open(tmpdir, O_PATH);
if (lo->proc_self_fd == -1) {
- fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
+ fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir);
exit(1);
}
+
+ if (umount2(tmpdir, MNT_DETACH) < 0) {
+ fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir);
+ exit(1);
+ }
+
+ if (rmdir(tmpdir) < 0) {
+ fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir);
+ }
}
/*
@@ -2643,7 +2679,7 @@ static void setup_mounts(const char *source)
int oldroot;
int newroot;
- if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
+ if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
exit(1);
}
@@ -2696,6 +2732,43 @@ static void setup_mounts(const char *source)
}
/*
+ * Only keep whitelisted capabilities that are needed for file system operation
+ */
+static void setup_capabilities(void)
+{
+ pthread_mutex_lock(&cap.mutex);
+ capng_restore_state(&cap.saved);
+
+ /*
+ * Whitelist file system-related capabilities that are needed for a file
+ * server to act like root. Drop everything else like networking and
+ * sysadmin capabilities.
+ *
+ * Exclusions:
+ * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
+ * and we don't support that.
+ * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
+ * used by the Smack LSM. Omit it until there is demand for it.
+ */
+ capng_setpid(syscall(SYS_gettid));
+ capng_clear(CAPNG_SELECT_BOTH);
+ capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
+ CAP_CHOWN,
+ CAP_DAC_OVERRIDE,
+ CAP_DAC_READ_SEARCH,
+ CAP_FOWNER,
+ CAP_FSETID,
+ CAP_SETGID,
+ CAP_SETUID,
+ CAP_MKNOD,
+ CAP_SETFCAP);
+ capng_apply(CAPNG_SELECT_BOTH);
+
+ cap.saved = capng_save_state();
+ pthread_mutex_unlock(&cap.mutex);
+}
+
+/*
* Lock down this process to prevent access to other processes or files outside
* source directory. This reduces the impact of arbitrary code execution bugs.
*/
@@ -2705,26 +2778,21 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
setup_namespaces(lo, se);
setup_mounts(lo->source);
setup_seccomp(enable_syslog);
+ setup_capabilities();
}
-/* Raise the maximum number of open file descriptors */
-static void setup_nofile_rlimit(void)
+/* Set the maximum number of open file descriptors */
+static void setup_nofile_rlimit(unsigned long rlimit_nofile)
{
- const rlim_t max_fds = 1000000;
- struct rlimit rlim;
-
- if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
- fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
- exit(1);
- }
+ struct rlimit rlim = {
+ .rlim_cur = rlimit_nofile,
+ .rlim_max = rlimit_nofile,
+ };
- if (rlim.rlim_cur >= max_fds) {
+ if (rlimit_nofile == 0) {
return; /* nothing to do */
}
- rlim.rlim_cur = max_fds;
- rlim.rlim_max = max_fds;
-
if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
/* Ignore SELinux denials */
if (errno == EPERM) {
@@ -2977,7 +3045,7 @@ int main(int argc, char *argv[])
fuse_daemonize(opts.foreground);
- setup_nofile_rlimit();
+ setup_nofile_rlimit(opts.rlimit_nofile);
/* Must be before sandbox since it wants /proc */
setup_capng();