From 2abc77af89e17582db9039293c8ac881c8c96d79 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 12 Jul 2018 11:18:42 -0400 Subject: new helper: open_with_fake_path() open a file by given inode, faking ->f_path. Use with shitloads of caution - at the very least you'd damn better make sure that some dentry alias of that inode is pinned down by the path in question. Again, this is no general-purpose interface and I hope it will eventually go away. Right now overlayfs wants something like that, but nothing else should. Any out-of-tree code with bright idea of using this one *will* eventually get hurt, with zero notice and great delight on my part. I refuse to use EXPORT_SYMBOL_GPL(), especially in situations when it's really EXPORT_SYMBOL_DONT_USE_IT(), but don't take that export as "you are welcome to use it". Signed-off-by: Al Viro --- fs/open.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs/open.c') diff --git a/fs/open.c b/fs/open.c index ee893240d199..dd15711eb658 100644 --- a/fs/open.c +++ b/fs/open.c @@ -925,6 +925,24 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +struct file *open_with_fake_path(const struct path *path, int flags, + struct inode *inode, const struct cred *cred) +{ + struct file *f = alloc_empty_file(flags, cred); + if (!IS_ERR(f)) { + int error; + + f->f_path = *path; + error = do_dentry_open(f, inode, NULL); + if (error) { + fput(f); + f = ERR_PTR(error); + } + } + return f; +} +EXPORT_SYMBOL(open_with_fake_path); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; -- cgit v1.2.3-55-g7522 From d3b1084dfd629ef89bc1c4bab95e5cb87e7d08c2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:40 +0200 Subject: vfs: make open_with_fake_path() not contribute to nr_files Stacking file operations in overlay will store an extra open file for each overlay file opened. The overhead is just that of "struct file" which is about 256bytes, because overlay already pins an extra dentry and inode when the file is open, which add up to a much larger overhead. For fear of breaking working setups, don't start accounting the extra file. Signed-off-by: Miklos Szeredi --- fs/file_table.c | 69 +++++++++++++++++++++++++++++++++++++----------------- fs/internal.h | 1 + fs/open.c | 2 +- include/linux/fs.h | 3 +++ 4 files changed, 53 insertions(+), 22 deletions(-) (limited to 'fs/open.c') diff --git a/fs/file_table.c b/fs/file_table.c index 9b70ed2bbc4e..0cc7bea6b51a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,7 +52,8 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { security_file_free(f); - percpu_counter_dec(&nr_files); + if (!(f->f_mode & FMODE_NOACCOUNT)) + percpu_counter_dec(&nr_files); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -91,6 +92,34 @@ int proc_nr_files(struct ctl_table *table, int write, } #endif +static struct file *__alloc_file(int flags, const struct cred *cred) +{ + struct file *f; + int error; + + f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); + if (unlikely(!f)) + return ERR_PTR(-ENOMEM); + + f->f_cred = get_cred(cred); + error = security_file_alloc(f); + if (unlikely(error)) { + file_free_rcu(&f->f_u.fu_rcuhead); + return ERR_PTR(error); + } + + atomic_long_set(&f->f_count, 1); + rwlock_init(&f->f_owner.lock); + spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); + eventpoll_init_file(f); + f->f_flags = flags; + f->f_mode = OPEN_FMODE(flags); + /* f->f_version: 0 */ + + return f; +} + /* Find an unused file structure and return a pointer to it. * Returns an error pointer if some error happend e.g. we over file * structures limit, run out of memory or operation is not permitted. @@ -105,7 +134,6 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) { static long old_max; struct file *f; - int error; /* * Privileged users can go above max_files @@ -119,26 +147,10 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) goto over; } - f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL); - if (unlikely(!f)) - return ERR_PTR(-ENOMEM); - - f->f_cred = get_cred(cred); - error = security_file_alloc(f); - if (unlikely(error)) { - file_free_rcu(&f->f_u.fu_rcuhead); - return ERR_PTR(error); - } + f = __alloc_file(flags, cred); + if (!IS_ERR(f)) + percpu_counter_inc(&nr_files); - atomic_long_set(&f->f_count, 1); - rwlock_init(&f->f_owner.lock); - spin_lock_init(&f->f_lock); - mutex_init(&f->f_pos_lock); - eventpoll_init_file(f); - f->f_flags = flags; - f->f_mode = OPEN_FMODE(flags); - /* f->f_version: 0 */ - percpu_counter_inc(&nr_files); return f; over: @@ -150,6 +162,21 @@ over: return ERR_PTR(-ENFILE); } +/* + * Variant of alloc_empty_file() that doesn't check and modify nr_files. + * + * Should not be used unless there's a very good reason to do so. + */ +struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) +{ + struct file *f = __alloc_file(flags, cred); + + if (!IS_ERR(f)) + f->f_mode |= FMODE_NOACCOUNT; + + return f; +} + /** * alloc_file - allocate and initialize a 'struct file' * diff --git a/fs/internal.h b/fs/internal.h index 52a346903748..442098fa0a84 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -94,6 +94,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *); * file_table.c */ extern struct file *alloc_empty_file(int, const struct cred *); +extern struct file *alloc_empty_file_noaccount(int, const struct cred *); /* * super.c diff --git a/fs/open.c b/fs/open.c index dd15711eb658..9c6617dbb2c0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -928,7 +928,7 @@ EXPORT_SYMBOL(dentry_open); struct file *open_with_fake_path(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { - struct file *f = alloc_empty_file(flags, cred); + struct file *f = alloc_empty_file_noaccount(flags, cred); if (!IS_ERR(f)) { int error; diff --git a/include/linux/fs.h b/include/linux/fs.h index 5ce2b413abc6..e1884840d556 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -156,6 +156,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)0x8000000) +/* File does not contribute to nr_files count */ +#define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) + /* * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector * that indicates that they should check the contents of the iovec are -- cgit v1.2.3-55-g7522 From a6518f73e60e5044656d1ba587e7463479a9381a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 6 Jul 2018 23:57:06 +0200 Subject: vfs: don't open real Let overlayfs do its thing when opening a file. This enables stacking and fixes the corner case when a file is opened for read, modified through a writable open, and data is read from the read-only file. After this patch the read-only open will not return stale data even in this case. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/open.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs/open.c') diff --git a/fs/open.c b/fs/open.c index 9c6617dbb2c0..fed24862ef83 100644 --- a/fs/open.c +++ b/fs/open.c @@ -893,13 +893,8 @@ EXPORT_SYMBOL(file_path); */ int vfs_open(const struct path *path, struct file *file) { - struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); - - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - file->f_path = *path; - return do_dentry_open(file, d_backing_inode(dentry), NULL); + return do_dentry_open(file, d_backing_inode(path->dentry), NULL); } struct file *dentry_open(const struct path *path, int flags, -- cgit v1.2.3-55-g7522 From 6742cee04353231015ddbe7e8b404ac9c1eb4473 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Revert "ovl: don't allow writing ioctl on lower layer" This reverts commit 7c6893e3c9abf6a9676e060a1e35e5caca673d57. Overlayfs no longer relies on the vfs for checking writability of files. Signed-off-by: Miklos Szeredi --- fs/internal.h | 2 -- fs/namespace.c | 64 +++------------------------------------------------------- fs/open.c | 4 ++-- fs/xattr.c | 9 ++++----- 4 files changed, 9 insertions(+), 70 deletions(-) (limited to 'fs/open.c') diff --git a/fs/internal.h b/fs/internal.h index 9c3b4c40e582..abb41f346b18 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -80,10 +80,8 @@ extern void __init mnt_init(void); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); -extern int mnt_want_write_file_path(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); -extern void mnt_drop_write_file_path(struct file *); /* * fs_struct.c diff --git a/fs/namespace.c b/fs/namespace.c index 76a742e36b32..c16921dba157 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -431,18 +431,13 @@ int __mnt_want_write_file(struct file *file) } /** - * mnt_want_write_file_path - get write access to a file's mount + * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already - * - * Called by the vfs for cases when we have an open file at hand, but will do an - * inode operation on it (important distinction for files opened on overlayfs, - * since the file operations will come from the real underlying file, while - * inode operations come from the overlay). */ -int mnt_want_write_file_path(struct file *file) +int mnt_want_write_file(struct file *file) { int ret; @@ -452,53 +447,6 @@ int mnt_want_write_file_path(struct file *file) sb_end_write(file->f_path.mnt->mnt_sb); return ret; } - -static inline int may_write_real(struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct dentry *upperdentry; - - /* Writable file? */ - if (file->f_mode & FMODE_WRITER) - return 0; - - /* Not overlayfs? */ - if (likely(!(dentry->d_flags & DCACHE_OP_REAL))) - return 0; - - /* File refers to upper, writable layer? */ - upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); - if (upperdentry && file_inode(file) == d_inode(upperdentry)) - return 0; - - /* Lower layer: can't write to real file, sorry... */ - return -EPERM; -} - -/** - * mnt_want_write_file - get write access to a file's mount - * @file: the file who's mount on which to take a write - * - * This is like mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already - * - * Mostly called by filesystems from their ioctl operation before performing - * modification. On overlayfs this needs to check if the file is on a read-only - * lower layer and deny access in that case. - */ -int mnt_want_write_file(struct file *file) -{ - int ret; - - ret = may_write_real(file); - if (!ret) { - sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); - if (ret) - sb_end_write(file_inode(file)->i_sb); - } - return ret; -} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** @@ -536,15 +484,9 @@ void __mnt_drop_write_file(struct file *file) __mnt_drop_write(file->f_path.mnt); } -void mnt_drop_write_file_path(struct file *file) -{ - mnt_drop_write(file->f_path.mnt); -} - void mnt_drop_write_file(struct file *file) { - __mnt_drop_write(file->f_path.mnt); - sb_end_write(file_inode(file)->i_sb); + mnt_drop_write(file->f_path.mnt); } EXPORT_SYMBOL(mnt_drop_write_file); diff --git a/fs/open.c b/fs/open.c index fed24862ef83..a3c03a4a9078 100644 --- a/fs/open.c +++ b/fs/open.c @@ -707,12 +707,12 @@ int ksys_fchown(unsigned int fd, uid_t user, gid_t group) if (!f.file) goto out; - error = mnt_want_write_file_path(f.file); + error = mnt_want_write_file(f.file); if (error) goto out_fput; audit_file(f.file); error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file_path(f.file); + mnt_drop_write_file(f.file); out_fput: fdput(f); out: diff --git a/fs/xattr.c b/fs/xattr.c index f9cb1db187b7..3a24027c062d 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -23,7 +23,6 @@ #include #include -#include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) @@ -501,10 +500,10 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file_path(f.file); + error = mnt_want_write_file(f.file); if (!error) { error = setxattr(f.file->f_path.dentry, name, value, size, flags); - mnt_drop_write_file_path(f.file); + mnt_drop_write_file(f.file); } fdput(f); return error; @@ -733,10 +732,10 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file_path(f.file); + error = mnt_want_write_file(f.file); if (!error) { error = removexattr(f.file->f_path.dentry, name); - mnt_drop_write_file_path(f.file); + mnt_drop_write_file(f.file); } fdput(f); return error; -- cgit v1.2.3-55-g7522 From 4ab30319fd7c691a1b3165325c647a5cd6d282ac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Revert "vfs: add flags to d_real()" This reverts commit 495e642939114478a5237a7d91661ba93b76f15a. No user of "flags" argument of d_real() remain. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- fs/open.c | 2 +- fs/overlayfs/super.c | 4 ++-- include/linux/dcache.h | 11 +++++------ include/linux/fs.h | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/open.c') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 2c391338c675..24e1a4f37c83 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -22,7 +22,7 @@ prototypes: struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int, unsigned int); + unsigned int); locking rules: rename_lock ->d_lock may block rcu-walk diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 829a7b7857a4..6417b161f88d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -1001,7 +1001,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int, unsigned int); + unsigned int); }; d_revalidate: called when the VFS needs to revalidate a dentry. This diff --git a/fs/open.c b/fs/open.c index a3c03a4a9078..a973ca074896 100644 --- a/fs/open.c +++ b/fs/open.c @@ -96,7 +96,7 @@ long vfs_truncate(const struct path *path, loff_t length) * write access on the upper inode, not on the overlay inode. For * non-overlay filesystems d_real() is an identity function. */ - upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0); + upperdentry = d_real(path->dentry, NULL, O_WRONLY); error = PTR_ERR(upperdentry); if (IS_ERR(upperdentry)) goto mnt_drop_write_and_out; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index a7e2287e4b8f..5bc261de5041 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -92,7 +92,7 @@ static int ovl_check_append_only(struct inode *inode, int flag) static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags, unsigned int flags) + unsigned int open_flags) { struct dentry *real; int err; @@ -128,7 +128,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, goto bug; /* Handle recursion */ - real = d_real(real, inode, open_flags, 0); + real = d_real(real, inode, open_flags); if (!inode || inode == d_inode(real)) return real; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ddae4103d324..8fe4efa94af6 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -146,7 +146,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int, unsigned int); + unsigned int); } ____cacheline_aligned; /* @@ -568,8 +568,7 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) - * @open_flags: open flags to control copy-up behavior - * @flags: flags to control what is returned by this function + * @flags: open flags to control copy-up behavior * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. @@ -578,10 +577,10 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags, unsigned int flags) + unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode, open_flags, flags); + return dentry->d_op->d_real(dentry, inode, flags); else return dentry; } @@ -596,7 +595,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0, 0)); + return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0)); } struct name_snapshot { diff --git a/include/linux/fs.h b/include/linux/fs.h index 8f8c9ac1c9d5..16e2741cec3c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1249,7 +1249,7 @@ static inline struct inode *file_inode(const struct file *f) static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file), 0, 0); + return d_real(file->f_path.dentry, file_inode(file), 0); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) -- cgit v1.2.3-55-g7522 From 8cf9ee5061037accf61775f438ad7513576d4413 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 18 Jul 2018 15:44:43 +0200 Subject: Revert "vfs: do get_write_access() on upper layer of overlayfs" This reverts commit 4d0c5ba2ff79ef9f5188998b29fd28fcb05f3667. We now get write access on both overlay and underlying layers so this patch is no longer needed for correct operation. Signed-off-by: Miklos Szeredi --- fs/locks.c | 3 +-- fs/open.c | 15 ++------------- 2 files changed, 3 insertions(+), 15 deletions(-) (limited to 'fs/open.c') diff --git a/fs/locks.c b/fs/locks.c index db7b6917d9c5..baa564841c03 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1654,8 +1654,7 @@ check_conflicting_open(const struct dentry *dentry, const long arg, int flags) if (flags & FL_LAYOUT) return 0; - if ((arg == F_RDLCK) && - (atomic_read(&d_real_inode(dentry)->i_writecount) > 0)) + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) return -EAGAIN; if ((arg == F_WRLCK) && ((d_count(dentry) > 1) || diff --git a/fs/open.c b/fs/open.c index a973ca074896..72bcae72bce9 100644 --- a/fs/open.c +++ b/fs/open.c @@ -68,7 +68,6 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, long vfs_truncate(const struct path *path, loff_t length) { struct inode *inode; - struct dentry *upperdentry; long error; inode = path->dentry->d_inode; @@ -91,17 +90,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (IS_APPEND(inode)) goto mnt_drop_write_and_out; - /* - * If this is an overlayfs then do as if opening the file so we get - * write access on the upper inode, not on the overlay inode. For - * non-overlay filesystems d_real() is an identity function. - */ - upperdentry = d_real(path->dentry, NULL, O_WRONLY); - error = PTR_ERR(upperdentry); - if (IS_ERR(upperdentry)) - goto mnt_drop_write_and_out; - - error = get_write_access(upperdentry->d_inode); + error = get_write_access(inode); if (error) goto mnt_drop_write_and_out; @@ -120,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) error = do_truncate(path->dentry, length, 0, NULL); put_write_and_out: - put_write_access(upperdentry->d_inode); + put_write_access(inode); mnt_drop_write_and_out: mnt_drop_write(path->mnt); out: -- cgit v1.2.3-55-g7522