From 653c60b633a9019a54a80d64b5ed33ecb214823c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Feb 2015 21:43:37 +1100 Subject: xfs: introduce mmap/truncate lock Right now we cannot serialise mmap against truncate or hole punch sanely. ->page_mkwrite is not able to take locks that the read IO path normally takes (i.e. the inode iolock) because that could result in lock inversions (read - iolock - page fault - page_mkwrite - iolock) and so we cannot use an IO path lock to serialise page write faults against truncate operations. Instead, introduce a new lock that is used *only* in the ->page_mkwrite path that is the equivalent of the iolock. The lock ordering in a page fault is i_mmaplock -> page lock -> i_ilock, and so in truncate we can i_iolock -> i_mmaplock and so lock out new write faults during the process of truncation. Because i_mmap_lock is outside the page lock, we can hold it across all the same operations we hold the i_iolock for. The only difference is that we never hold the i_mmaplock in the normal IO path and so do not ever have the possibility that we can page fault inside it. Hence there are no recursion issues on the i_mmap_lock and so we can use it to serialise page fault IO against inode modification operations that affect the IO path. This patch introduces the i_mmaplock infrastructure, lockdep annotations and initialisation/destruction code. Use of the new lock will be in subsequent patches. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 128 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 97 insertions(+), 31 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index daafa1f6d260..ac24818f7b2d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. + * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order mean that we cannot hold the + * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race + * free manner (e.g. truncate, hole punch and other extent manipulation + * functions). */ void xfs_ilock( @@ -150,6 +160,8 @@ xfs_ilock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -159,6 +171,11 @@ xfs_ilock( else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) @@ -191,6 +208,8 @@ xfs_ilock_nowait( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -202,21 +221,35 @@ xfs_ilock_nowait( if (!mrtryaccess(&ip->i_iolock)) goto out; } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + if (lock_flags & XFS_ILOCK_EXCL) { if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } return 1; - out_undo_iolock: +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) mrunlock_excl(&ip->i_iolock); else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); - out: +out: return 0; } @@ -244,6 +277,8 @@ xfs_iunlock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -254,6 +289,11 @@ xfs_iunlock( else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) @@ -271,11 +311,14 @@ xfs_ilock_demote( xfs_inode_t *ip, uint lock_flags) { - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); @@ -294,6 +337,12 @@ xfs_isilocked( return rwsem_is_locked(&ip->i_lock.mr_lock); } + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) return !!ip->i_iolock.mr_writer; @@ -314,14 +363,27 @@ int xfs_lock_delays; #endif /* - * Bump the subclass so xfs_lock_inodes() acquires each lock with - * a different value + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; @@ -440,10 +502,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock - * at a time - the iolock or the ilock, but not both at once. If - * we lock both at once, lockdep will report false positives saying - * we have violated locking orders. + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_two_inodes( @@ -455,8 +517,12 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { -- cgit v1.2.3-55-g7522 From 58c904734cd0917cd0953067dd68003572407c7b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 23 Feb 2015 22:38:08 +1100 Subject: xfs: inodes are new until the dentry cache is set up Al Viro noticed a generic set of issues to do with filehandle lookup racing with dentry cache setup. They involve a filehandle lookup occurring while an inode is being created and the filehandle lookup racing with the dentry creation for the real file. This can lead to multiple dentries for the one path being instantiated. There are a host of other issues around this same set of paths. The underlying cause is that file handle lookup only waits on inode cache instantiation rather than full dentry cache instantiation. XFS is mostly immune to the problems discovered due to it's own internal inode cache, but there are a couple of corner cases where races can happen. We currently clear the XFS_INEW flag when the inode is fully set up after insertion into the cache. Newly allocated inodes are inserted locked and so aren't usable until the allocation transaction commits. This, however, occurs before the dentry and security information is fully initialised and hence the inode is unlocked and available for lookups to find too early. To solve the problem, only clear the XFS_INEW flag for newly created inodes once the dentry is fully instantiated. This means lookups will retry until the XFS_INEW flag is removed from the inode and hence avoids the race conditions in questions. THis also means that xfs_create(), xfs_create_tmpfile() and xfs_symlink() need to finish the setup of the inode in their error paths if we had allocated the inode but failed later in the creation process. xfs_symlink(), in particular, needed a lot of help to make it's error handling match that of xfs_create(). Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 4 ++-- fs/xfs/xfs_inode.c | 22 ++++++++++++-------- fs/xfs/xfs_inode.h | 22 ++++++++++++++++++++ fs/xfs/xfs_iops.c | 24 +++++++++------------- fs/xfs/xfs_iops.h | 2 -- fs/xfs/xfs_qm.c | 13 ++++++++---- fs/xfs/xfs_symlink.c | 58 ++++++++++++++++++++++++++++++---------------------- 7 files changed, 90 insertions(+), 55 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9771b7ef62ed..76a9f2783282 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -439,11 +439,11 @@ again: *ipp = ip; /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) + * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); + xfs_setup_existing_inode(ip); return 0; out_error_or_again: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index daafa1f6d260..d0414f305967 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -818,7 +818,7 @@ xfs_ialloc( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); - /* now that we have an i_mode we can setup inode ops and unlock */ + /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); *ipp = ip; @@ -1235,12 +1235,14 @@ xfs_create( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1345,12 +1347,14 @@ xfs_create_tmpfile( xfs_trans_cancel(tp, cancel_flags); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 86cd6b39bed7..8e82b41d2050 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -390,6 +390,28 @@ int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); int xfs_iozero(struct xfs_inode *, loff_t, size_t); +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. + */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d919ad7b16bf..d7782ae1af3c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -187,6 +187,8 @@ xfs_generic_create( else d_instantiate(dentry, inode); + xfs_finish_inode_setup(ip); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -195,6 +197,7 @@ xfs_generic_create( return error; out_cleanup_inode: + xfs_finish_inode_setup(ip); if (!tmpfile) xfs_cleanup_inode(dir, inode, dentry); iput(inode); @@ -367,9 +370,11 @@ xfs_vn_symlink( goto out_cleanup_inode; d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); return 0; out_cleanup_inode: + xfs_finish_inode_setup(cip); xfs_cleanup_inode(dir, inode, dentry); iput(inode); out: @@ -1236,16 +1241,12 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode, set up the operation vectors and - * unlock the inode. - * - * When reading existing inodes from disk this is called directly - * from xfs_iget, when creating a new inode it is called from - * xfs_ialloc after setting up the inode. + * Initialize the Linux inode and set up the operation vectors. * - * We are always called with an uninitialised linux inode here. - * We need to initialise the necessary fields and take a reference - * on it. + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1332,9 +1333,4 @@ xfs_setup_inode( inode_has_no_xattr(inode); cache_no_acl(inode); } - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ea7a98e9cb70..a0f84abb0d09 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setup_inode(struct xfs_inode *); - /* * Internal setattr interfaces. */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 53cc2aaf8d2b..c6b22e1e77ed 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -719,6 +719,7 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; int committed; + bool need_alloc = true; *ip = NULL; /* @@ -747,6 +748,7 @@ xfs_qm_qino_alloc( return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; } } @@ -758,7 +760,7 @@ xfs_qm_qino_alloc( return error; } - if (!*ip) { + if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { @@ -794,11 +796,14 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); - return error; } - return 0; + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 25791df6f638..3df411eadb86 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -177,7 +177,7 @@ xfs_symlink( int pathlen; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; + bool unlock_dp_on_error = false; uint cancel_flags; int committed; xfs_fileoff_t first_fsb; @@ -221,7 +221,7 @@ xfs_symlink( XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) - goto std_return; + return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); cancel_flags = XFS_TRANS_RELEASE_LOG_RES; @@ -241,7 +241,7 @@ xfs_symlink( } if (error) { cancel_flags = 0; - goto error_return; + goto out_trans_cancel; } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); @@ -252,7 +252,7 @@ xfs_symlink( */ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; - goto error_return; + goto out_trans_cancel; } /* @@ -261,7 +261,7 @@ xfs_symlink( error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; /* * Check for ability to enter directory entry, if no space reserved. @@ -269,7 +269,7 @@ xfs_symlink( if (!resblks) { error = xfs_dir_canenter(tp, dp, link_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * Initialize the bmap freelist prior to calling either @@ -282,15 +282,14 @@ xfs_symlink( */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto error_return; - goto error1; - } + if (error) + goto out_trans_cancel; /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); @@ -330,7 +329,7 @@ xfs_symlink( XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, &free_list); if (error) - goto error2; + goto out_bmap_cancel; if (resblks) resblks -= fs_blocks; @@ -348,7 +347,7 @@ xfs_symlink( BTOBB(byte_cnt), 0); if (!bp) { error = -ENOMEM; - goto error2; + goto out_bmap_cancel; } bp->b_ops = &xfs_symlink_buf_ops; @@ -378,7 +377,7 @@ xfs_symlink( error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, &first_block, &free_list, resblks); if (error) - goto error2; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -392,10 +391,13 @@ xfs_symlink( } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } + if (error) + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -403,20 +405,28 @@ xfs_symlink( *ipp = ip; return 0; - error2: - IRELE(ip); - error1: +out_bmap_cancel: xfs_bmap_cancel(&free_list); cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: return error; } -- cgit v1.2.3-55-g7522 From 95afcf5c7bca93fb84d260f70c304f35ef4c3114 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2015 14:03:32 +1100 Subject: xfs: clean up inode locking for RENAME_WHITEOUT When doing RENAME_WHITEOUT, we now have to lock 5 inodes into the rename transaction. This means we need to update xfs_sort_for_rename() and xfs_lock_inodes() to handle up to 5 inodes. Because of the vagaries of rename, this means we could have anywhere between 3 and 5 inodes locked into the transaction.... While xfs_lock_inodes() does not need anything other than an assert telling us we are passing more inodes that we ever thought we should see, it could do with a logic rework to remove all the indenting. This is not a functional change - it just makes the code a lot easier to read. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 145 +++++++++++++++++++++++++---------------------------- 1 file changed, 67 insertions(+), 78 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d0414f305967..d0a98bafcbac 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -329,15 +329,14 @@ xfs_lock_inumorder(int lock_mode, int subclass) } /* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. + * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. */ void xfs_lock_inodes( @@ -348,30 +347,27 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - ASSERT(ips && (inodes >= 2)); /* we need at least two */ + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); try_lock = 0; i = 0; - again: for (; i < inodes; i++) { ASSERT(ips[i]); - if (i && (ips[i] == ips[i-1])) /* Already locked */ + if (i && (ips[i] == ips[i - 1])) /* Already locked */ continue; /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. */ - if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) try_lock++; - } } } @@ -381,51 +377,42 @@ again: * we can't get any, we must release all we have * and try again. */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; + } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; - if (try_lock) { - /* try_lock must be 0 if i is 0. */ + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { /* - * try_lock means we have an inode locked - * that is in the AIL. + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; + + xfs_iunlock(ips[j], lock_mode); + } - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ #ifdef DEBUG - xfs_lock_delays++; + xfs_lock_delays++; #endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { - xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } + i = 0; + try_lock = 0; + goto again; } #ifdef DEBUG @@ -2615,19 +2602,22 @@ xfs_remove( /* * Enter all inodes for a rename transaction into a sorted array. */ +#define __XFS_SORT_INODES 5 STATIC void xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ { - xfs_inode_t *temp; int i, j; + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + /* * i_tab contains a list of pointers to inodes. We initialize * the table here & we'll sort it. We will then use it to @@ -2635,25 +2625,24 @@ xfs_sort_for_rename( * * Note that the table may contain duplicates. e.g., dp1 == dp2. */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; /* * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) + * most 5 elements to sort, so this is adequate.) */ for (i = 0; i < *num_inodes; i++) { for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; + struct xfs_inode *temp = i_tab[j]; i_tab[j] = i_tab[j-1]; i_tab[j-1] = temp; } @@ -2801,16 +2790,16 @@ xfs_rename( xfs_fsblock_t first_block; int cancel_flags; int committed; - xfs_inode_t *inodes[4]; + xfs_inode_t *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; int spaceres; - int num_inodes; trace_xfs_rename(src_dp, target_dp, src_name, target_name); new_parent = (src_dp != target_dp); src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, NULL, inodes, &num_inodes); xfs_bmap_init(&free_list, &first_block); -- cgit v1.2.3-55-g7522 From 445883e8133975fe8d555610c70c1c4c501f0bf8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2015 14:05:43 +1100 Subject: xfs: cleanup xfs_rename error handling The jump labels are ambiguous and unclear and some of the error paths are used inconsistently. Rules for error jumps are: - use out_trans_cancel for unmodified transaction context - use out_bmap_cancel on ENOSPC errors - use out_trans_abort when transaction is likely to be dirty. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 59 ++++++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 33 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d0a98bafcbac..426dbf7d094a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2788,7 +2788,7 @@ xfs_rename( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; + int cancel_flags = 0; int committed; xfs_inode_t *inodes[__XFS_SORT_INODES]; int num_inodes = __XFS_SORT_INODES; @@ -2802,28 +2802,23 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, NULL, inodes, &num_inodes); - xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); if (error == -ENOSPC) { spaceres = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } + if (error) + goto out_trans_cancel; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } + if (error) + goto out_trans_cancel; /* * Lock all the participating inodes. Depending upon whether @@ -2853,9 +2848,11 @@ xfs_rename( if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { error = -EXDEV; - goto error_return; + goto out_trans_cancel; } + xfs_bmap_init(&free_list, &first_block); + /* * Handle RENAME_EXCHANGE flags */ @@ -2864,7 +2861,7 @@ xfs_rename( target_dp, target_name, target_ip, &free_list, &first_block, spaceres); if (error) - goto abort_return; + goto out_trans_abort; goto finish_rename; } @@ -2879,7 +2876,7 @@ xfs_rename( if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * If target does not exist and the rename crosses @@ -2890,9 +2887,9 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error == -ENOSPC) - goto error_return; + goto out_bmap_cancel; if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2900,7 +2897,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto abort_return; + goto out_trans_abort; } } else { /* target_ip != NULL */ /* @@ -2915,7 +2912,7 @@ xfs_rename( if (!(xfs_dir_isempty(target_ip)) || (target_ip->i_d.di_nlink > 2)) { error = -EEXIST; - goto error_return; + goto out_trans_cancel; } } @@ -2932,7 +2929,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2943,7 +2940,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; if (src_is_directory) { /* @@ -2951,7 +2948,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_trans_abort; } } /* target_ip != NULL */ @@ -2968,7 +2965,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto abort_return; + goto out_trans_abort; } /* @@ -2994,13 +2991,13 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto abort_return; + goto out_trans_abort; } error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_trans_abort; xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); @@ -3018,12 +3015,8 @@ finish_rename: } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; - } + if (error) + goto out_trans_abort; /* * trans_commit will unlock src_ip, target_ip & decrement @@ -3031,12 +3024,12 @@ finish_rename: */ return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - abort_return: +out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; - error_return: +out_bmap_cancel: xfs_bmap_cancel(&free_list); +out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); - std_return: return error; } -- cgit v1.2.3-55-g7522 From 310606b0c7e385e9dd3533d168413ad2c579d961 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2015 14:06:07 +1100 Subject: xfs: factor out xfs_finish_rename() Rather than use a jump label for the final transaction commit in the rename, factor it into a simple helper function and call it appropriately. This slightly reduces the spaghetti nature of xfs_rename. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 426dbf7d094a..c3fe00cb19e4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2650,6 +2650,31 @@ xfs_sort_for_rename( } } +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + return error; + } + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +} + /* * xfs_cross_rename() * @@ -2789,7 +2814,6 @@ xfs_rename( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; int cancel_flags = 0; - int committed; xfs_inode_t *inodes[__XFS_SORT_INODES]; int num_inodes = __XFS_SORT_INODES; int spaceres; @@ -2862,7 +2886,7 @@ xfs_rename( &free_list, &first_block, spaceres); if (error) goto out_trans_abort; - goto finish_rename; + return xfs_finish_rename(tp, &free_list); } /* @@ -3004,25 +3028,7 @@ xfs_rename( if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); -finish_rename: - /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) - goto out_trans_abort; - - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_finish_rename(tp, &free_list); out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; -- cgit v1.2.3-55-g7522 From eeacd3217b8fa8143f5dc27ded405790c74f01e8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2015 14:08:07 +1100 Subject: xfs: make xfs_cross_rename() complete fully Now that xfs_finish_rename() exists, there is no reason for xfs_cross_rename() to return to xfs_rename() to finish off the rename transaction. Drive the completion code into xfs_cross_rename() and handle all errors there so as to simplify the xfs_rename() code. Further, push the rename exchange target_ip check to early in the rename code so as to make the error handling easy and obviously correct. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c3fe00cb19e4..b376ebe5fad9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2703,14 +2703,14 @@ xfs_cross_rename( ip2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* * If we're renaming one or more directories across different parents, @@ -2725,16 +2725,16 @@ xfs_cross_rename( dp1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(ip1->i_d.di_mode)) { error = xfs_droplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; } /* @@ -2752,16 +2752,16 @@ xfs_cross_rename( dp2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(ip2->i_d.di_mode)) { error = xfs_droplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; } /* @@ -2789,7 +2789,11 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); -out: + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); return error; } @@ -2820,6 +2824,9 @@ xfs_rename( trace_xfs_rename(src_dp, target_dp, src_name, target_name); + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; + new_parent = (src_dp != target_dp); src_is_directory = S_ISDIR(src_ip->i_d.di_mode); @@ -2877,17 +2884,11 @@ xfs_rename( xfs_bmap_init(&free_list, &first_block); - /* - * Handle RENAME_EXCHANGE flags - */ - if (flags & RENAME_EXCHANGE) { - error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); - if (error) - goto out_trans_abort; - return xfs_finish_rename(tp, &free_list); - } + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); /* * Set up the target. -- cgit v1.2.3-55-g7522 From 7dcf5c3e4527cfa2807567b00387cf2ed5e07f00 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 25 Mar 2015 14:08:08 +1100 Subject: xfs: add RENAME_WHITEOUT support Whiteouts are used by overlayfs - it has a crazy convention that a whiteout is a character device inode with a major:minor of 0:0. Because it's not documented anywhere, here's an example of what RENAME_WHITEOUT does on ext4: # echo foo > /mnt/scratch/foo # echo bar > /mnt/scratch/bar # ls -l /mnt/scratch total 24 -rw-r--r-- 1 root root 4 Feb 11 20:22 bar -rw-r--r-- 1 root root 4 Feb 11 20:22 foo drwx------ 2 root root 16384 Feb 11 20:18 lost+found # src/renameat2 -w /mnt/scratch/foo /mnt/scratch/bar # ls -l /mnt/scratch total 20 -rw-r--r-- 1 root root 4 Feb 11 20:22 bar c--------- 1 root root 0, 0 Feb 11 20:23 foo drwx------ 2 root root 16384 Feb 11 20:18 lost+found # cat /mnt/scratch/bar foo # In XFS rename terms, the operation that has been done is that source (foo) has been moved to the target (bar), which is like a nomal rename operation, but rather than the source being removed, it have been replaced with a whiteout. We can't allocate whiteout inodes within the rename transaction due to allocation being a multi-commit transaction: rename needs to be a single, atomic commit. Hence we have several options here, form most efficient to least efficient: - use DT_WHT in the target dirent and do no whiteout inode allocation. The main issue with this approach is that we need hooks in lookup to create a virtual chardev inode to present to userspace and in places where we might need to modify the dirent e.g. unlink. Overlayfs also needs to be taught about DT_WHT. Most invasive change, lowest overhead. - create a special whiteout inode in the root directory (e.g. a ".wino" dirent) and then hardlink every new whiteout to it. This means we only need to create a single whiteout inode, and rename simply creates a hardlink to it. We can use DT_WHT for these, though using DT_CHR means we won't have to modify overlayfs, nor anything in userspace. Downside is we have to look up the whiteout inode on every operation and create it if it doesn't exist. - copy ext4: create a special whiteout chardev inode for every whiteout. This is more complex than the above options because of the lack of atomicity between inode creation and the rename operation, requiring us to create a tmpfile inode and then linking it into the directory structure during the rename. At least with a tmpfile inode crashes between the create and rename doesn't leave unreferenced inodes or directory pollution around. By far the simplest thing to do in the short term is to copy ext4. While it is the most inefficient way of supporting whiteouts, but as an initial implementation we can simply reuse existing functions and add a small amount of extra code the the rename operation. When we get full whiteout support in the VFS (via the dentry cache) we can then look to supporting DT_WHT method outlined as the first method of supporting whiteouts. But until then, we'll stick with what overlayfs expects us to be: dumb and stupid. Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 129 +++++++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_iops.c | 2 +- 2 files changed, 107 insertions(+), 24 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b376ebe5fad9..ded129d5ec0d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2797,40 +2797,81 @@ out_trans_abort: return error; } +/* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* Satisfy xfs_bumplink that this is a real tmpfile */ + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + /* * xfs_rename */ int xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip, - unsigned int flags) + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) { - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags = 0; - xfs_inode_t *inodes[__XFS_SORT_INODES]; - int num_inodes = __XFS_SORT_INODES; - int spaceres; + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + int new_parent = (src_dp != target_dp); + int src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int cancel_flags = 0; + int spaceres; + int error; trace_xfs_rename(src_dp, target_dp, src_name, target_name); if ((flags & RENAME_EXCHANGE) && !target_ip) return -EINVAL; - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, NULL, + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); @@ -2870,6 +2911,8 @@ xfs_rename( xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -3019,17 +3062,55 @@ xfs_rename( goto out_trans_abort; } - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. + */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) goto out_trans_abort; + /* + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. + */ + if (wip) { + ASSERT(wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_trans_abort; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_trans_abort; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); + + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; + } + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - return xfs_finish_rename(tp, &free_list); + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; out_trans_abort: cancel_flags |= XFS_TRANS_ABORT; @@ -3037,6 +3118,8 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: xfs_trans_cancel(tp, cancel_flags); + if (wip) + IRELE(wip); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d7782ae1af3c..9bcad7132f75 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -394,7 +394,7 @@ xfs_vn_rename( struct xfs_name oname; struct xfs_name nname; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; /* if we are exchanging files, we need to set i_mode of both files */ -- cgit v1.2.3-55-g7522 From 86aaf02e57d25a3b4094ca76ff805ee2d7aa30f8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 25 Mar 2015 14:54:53 +1100 Subject: xfs: use bool instead of int in xfs_rename() new_parent and src_is_directory are only used in 0/1 context. Signed-off-by: Fabian Frederick Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index daafa1f6d260..b7064f20de7f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2790,8 +2790,8 @@ xfs_rename( { xfs_trans_t *tp = NULL; xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ + bool new_parent; /* moving to a new dir */ + bool src_is_directory; /* src_name is a directory */ int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; -- cgit v1.2.3-55-g7522 From 6dfe5a049f2d48582050339d2a6b6fda36dfd14c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:40:08 +1000 Subject: xfs: xfs_attr_inactive leaves inconsistent attr fork state behind xfs_attr_inactive() is supposed to clean up the attribute fork when the inode is being freed. While it removes attribute fork extents, it completely ignores attributes in local format, which means that there can still be active attributes on the inode after xfs_attr_inactive() has run. This leads to problems with concurrent inode writeback - the in-core inode attribute fork is removed without locking on the assumption that nothing will be attempting to access the attribute fork after a call to xfs_attr_inactive() because it isn't supposed to exist on disk any more. To fix this, make xfs_attr_inactive() completely remove all traces of the attribute fork from the inode, regardless of it's state. Further, also remove the in-core attribute fork structure safely so that there is nothing further that needs to be done by callers to clean up the attribute fork. This means we can remove the in-core and on-disk attribute forks atomically. Also, on error simply remove the in-memory attribute fork. There's nothing that can be done with it once we have failed to remove the on-disk attribute fork, so we may as well just blow it away here anyway. cc: # 3.12 to 4.0 Reported-by: Waiman Long Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 8 ++--- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_attr_inactive.c | 83 +++++++++++++++++++++++++------------------ fs/xfs/xfs_inode.c | 12 +++---- 4 files changed, 58 insertions(+), 47 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 04e79d57bca6..e9d401ce93bb 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -574,8 +574,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) * After the last attribute is removed revert to original inode format, * making all literal area available to the data fork once more. */ -STATIC void -xfs_attr_fork_reset( +void +xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { @@ -641,7 +641,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (mp->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); @@ -905,7 +905,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 025c4b820c03..882c8d338891 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -53,7 +53,7 @@ int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); - +void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index f9c1c64782d3..3fbf167cfb4c 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -380,23 +380,31 @@ xfs_attr3_root_inactive( return error; } +/* + * xfs_attr_inactive kills all traces of an attribute fork on an inode. It + * removes both the on-disk and in-memory inode fork. Note that this also has to + * handle the condition of inodes without attributes but with an attribute fork + * configured, so we can't use xfs_inode_hasattr() here. + * + * The in-memory attribute fork is removed even on error. + */ int -xfs_attr_inactive(xfs_inode_t *dp) +xfs_attr_inactive( + struct xfs_inode *dp) { - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; + struct xfs_trans *trans; + struct xfs_mount *mp; + int cancel_flags = 0; + int lock_mode = XFS_ILOCK_SHARED; + int error = 0; mp = dp->i_mount; ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return 0; - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_ilock(dp, lock_mode); + if (!XFS_IFORK_Q(dp)) + goto out_destroy_fork; + xfs_iunlock(dp, lock_mode); /* * Start our first transaction of the day. @@ -408,13 +416,18 @@ xfs_attr_inactive(xfs_inode_t *dp) * the inode in every transaction to let it float upward through * the log. */ + lock_mode = 0; trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); - if (error) { - xfs_trans_cancel(trans, 0); - return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + if (error) + goto out_cancel; + + lock_mode = XFS_ILOCK_EXCL; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; + xfs_ilock(dp, lock_mode); + + if (!XFS_IFORK_Q(dp)) + goto out_cancel; /* * No need to make quota reservations here. We expect to release some @@ -422,29 +435,31 @@ xfs_attr_inactive(xfs_inode_t *dp) */ xfs_trans_ijoin(trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = 0; - goto out; + /* invalidate and truncate the attribute fork extents */ + if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out_cancel; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out_cancel; } - error = xfs_attr3_root_inactive(&trans, dp); - if (error) - goto out; - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); - if (error) - goto out; + /* Reset the attribute fork - this also destroys the in-core fork */ + xfs_attr_fork_remove(dp, trans); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - + xfs_iunlock(dp, lock_mode); return error; -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_cancel: + xfs_trans_cancel(trans, cancel_flags); +out_destroy_fork: + /* kill the in-core attr fork before we drop the inode lock */ + if (dp->i_afp) + xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (lock_mode) + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6ebc85192b7..1117dd3ba123 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1946,21 +1946,17 @@ xfs_inactive( /* * If there are attributes associated with the file then blow them away * now. The code calls a routine that recursively deconstructs the - * attribute fork. We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). + * attribute fork. If also blows away the in-core attribute fork. */ - if (ip->i_d.di_anextents > 0) { - ASSERT(ip->i_d.di_forkoff != 0); - + if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) return; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - + ASSERT(!ip->i_afp); ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); /* * Free the inode. -- cgit v1.2.3-55-g7522 From 22419ac9fe5e79483596cebdbd1d1209c18bac1a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 29 May 2015 08:14:55 +1000 Subject: xfs: fix broken i_nlink accounting for whiteout tmpfile inode XFS uses the internal tmpfile() infrastructure for the whiteout inode used for RENAME_WHITEOUT operations. For tmpfile inodes, XFS allocates the inode, drops di_nlink, adds the inode to the agi unlinked list, calls d_tmpfile() which correspondingly drops i_nlink of the vfs inode, and then finishes the common inode setup (e.g., clear I_NEW and unlock). The d_tmpfile() call was originally made inxfs_create_tmpfile(), but was pulled up out of that function as part of the following commit to resolve a deadlock issue: 330033d6 xfs: fix tmpfile/selinux deadlock and initialize security As a result, callers of xfs_create_tmpfile() are responsible for either calling d_tmpfile() or fixing up i_nlink appropriately. The whiteout tmpfile allocation helper does neither. As a result, the vfs ->i_nlink becomes inconsistent with the on-disk ->di_nlink once xfs_rename() links it back into the source dentry and calls xfs_bumplink(). Update the assert in xfs_rename() to help detect this problem in the future and update xfs_rename_alloc_whiteout() to decrement the link count as part of the manual tmpfile inode setup. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1117dd3ba123..539a85fddbc2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2879,7 +2879,13 @@ xfs_rename_alloc_whiteout( if (error) return error; - /* Satisfy xfs_bumplink that this is a real tmpfile */ + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. + * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -3147,7 +3153,7 @@ xfs_rename( * intermediate state on disk. */ if (wip) { - ASSERT(wip->i_d.di_nlink == 0); + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) goto out_trans_abort; -- cgit v1.2.3-55-g7522