summaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_icache.c
diff options
context:
space:
mode:
authorDarrick J. Wong2016-10-03 18:11:46 +0200
committerDarrick J. Wong2016-10-06 01:26:28 +0200
commit83104d449e8c44e4870a795132437257cdf80006 (patch)
treeb121d0dc8b995abc636adad9adbfa723dcbb8f3e /fs/xfs/xfs_icache.c
parentxfs: try other AGs to allocate a BMBT block (diff)
downloadkernel-qcow2-linux-83104d449e8c44e4870a795132437257cdf80006.tar.gz
kernel-qcow2-linux-83104d449e8c44e4870a795132437257cdf80006.tar.xz
kernel-qcow2-linux-83104d449e8c44e4870a795132437257cdf80006.zip
xfs: garbage collect old cowextsz reservations
Trim CoW reservations made on behalf of a cowextsz hint if they get too old or we run low on quota, so long as we don't have dirty data awaiting writeback or directio operations in progress. Garbage collection of the cowextsize extents are kept separate from prealloc extent reaping because setting the CoW prealloc lifetime to a (much) higher value than the regular prealloc extent lifetime has been useful for combatting CoW fragmentation on VM hosts where the VMs experience bursty write behaviors and we can keep the utilization ratios low enough that we don't start to run out of space. IOWs, it benefits us to keep the CoW fork reservations around for as long as we can unless we run out of blocks or hit inode reclaim. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs/xfs/xfs_icache.c')
-rw-r--r--fs/xfs/xfs_icache.c238
1 files changed, 209 insertions, 29 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2d3de02f3529..14796b744e0a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,7 @@
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
+#include "xfs_reflink.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -792,6 +793,33 @@ xfs_eofblocks_worker(
xfs_queue_eofblocks(mp);
}
+/*
+ * Background scanning to trim preallocated CoW space. This is queued
+ * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
+ * (We'll just piggyback on the post-EOF prealloc space workqueue.)
+ */
+STATIC void
+xfs_queue_cowblocks(
+ struct xfs_mount *mp)
+{
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
+ queue_delayed_work(mp->m_eofblocks_workqueue,
+ &mp->m_cowblocks_work,
+ msecs_to_jiffies(xfs_cowb_secs * 1000));
+ rcu_read_unlock();
+}
+
+void
+xfs_cowblocks_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_cowblocks_work);
+ xfs_icache_free_cowblocks(mp, NULL);
+ xfs_queue_cowblocks(mp);
+}
+
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
@@ -1348,18 +1376,30 @@ xfs_inode_free_eofblocks(
return ret;
}
-int
-xfs_icache_free_eofblocks(
+static int
+__xfs_icache_free_eofblocks(
struct xfs_mount *mp,
- struct xfs_eofblocks *eofb)
+ struct xfs_eofblocks *eofb,
+ int (*execute)(struct xfs_inode *ip, int flags,
+ void *args),
+ int tag)
{
int flags = SYNC_TRYLOCK;
if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
flags = SYNC_WAIT;
- return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
- eofb, XFS_ICI_EOFBLOCKS_TAG);
+ return xfs_inode_ag_iterator_tag(mp, execute, flags,
+ eofb, tag);
+}
+
+int
+xfs_icache_free_eofblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
+ XFS_ICI_EOFBLOCKS_TAG);
}
/*
@@ -1368,9 +1408,11 @@ xfs_icache_free_eofblocks(
* failure. We make a best effort by including each quota under low free space
* conditions (less than 1% free space) in the scan.
*/
-int
-xfs_inode_free_quota_eofblocks(
- struct xfs_inode *ip)
+static int
+__xfs_inode_free_quota_eofblocks(
+ struct xfs_inode *ip,
+ int (*execute)(struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb))
{
int scan = 0;
struct xfs_eofblocks eofb = {0};
@@ -1406,14 +1448,25 @@ xfs_inode_free_quota_eofblocks(
}
if (scan)
- xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+ execute(ip->i_mount, &eofb);
return scan;
}
-void
-xfs_inode_set_eofblocks_tag(
- xfs_inode_t *ip)
+int
+xfs_inode_free_quota_eofblocks(
+ struct xfs_inode *ip)
+{
+ return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
+}
+
+static void
+__xfs_inode_set_eofblocks_tag(
+ xfs_inode_t *ip,
+ void (*execute)(struct xfs_mount *mp),
+ void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int error, unsigned long caller_ip),
+ int tag)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
@@ -1431,26 +1484,22 @@ xfs_inode_set_eofblocks_tag(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- trace_xfs_inode_set_eofblocks_tag(ip);
- tagged = radix_tree_tagged(&pag->pag_ici_root,
- XFS_ICI_EOFBLOCKS_TAG);
+ tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
radix_tree_tag_set(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
if (!tagged) {
/* propagate the eofblocks tag up into the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_set(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
+ tag);
spin_unlock(&ip->i_mount->m_perag_lock);
/* kick off background trimming */
- xfs_queue_eofblocks(ip->i_mount);
+ execute(ip->i_mount);
- trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
+ set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
@@ -1458,9 +1507,22 @@ xfs_inode_set_eofblocks_tag(
}
void
-xfs_inode_clear_eofblocks_tag(
+xfs_inode_set_eofblocks_tag(
xfs_inode_t *ip)
{
+ trace_xfs_inode_set_eofblocks_tag(ip);
+ return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
+ trace_xfs_perag_set_eofblocks,
+ XFS_ICI_EOFBLOCKS_TAG);
+}
+
+static void
+__xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip,
+ void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+ int error, unsigned long caller_ip),
+ int tag)
+{
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
@@ -1470,23 +1532,141 @@ xfs_inode_clear_eofblocks_tag(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
spin_lock(&pag->pag_ici_lock);
- trace_xfs_inode_clear_eofblocks_tag(ip);
radix_tree_tag_clear(&pag->pag_ici_root,
- XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
- if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
+ if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
/* clear the eofblocks tag from the perag radix tree */
spin_lock(&ip->i_mount->m_perag_lock);
radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
- XFS_ICI_EOFBLOCKS_TAG);
+ tag);
spin_unlock(&ip->i_mount->m_perag_lock);
- trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
- -1, _RET_IP_);
+ clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
}
spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag);
}
+void
+xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+ return __xfs_inode_clear_eofblocks_tag(ip,
+ trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+/*
+ * Automatic CoW Reservation Freeing
+ *
+ * These functions automatically garbage collect leftover CoW reservations
+ * that were made on behalf of a cowextsize hint when we start to run out
+ * of quota or when the reservations sit around for too long. If the file
+ * has dirty pages or is undergoing writeback, its CoW reservations will
+ * be retained.
+ *
+ * The actual garbage collection piggybacks off the same code that runs
+ * the speculative EOF preallocation garbage collector.
+ */
+STATIC int
+xfs_inode_free_cowblocks(
+ struct xfs_inode *ip,
+ int flags,
+ void *args)
+{
+ int ret;
+ struct xfs_eofblocks *eofb = args;
+ bool need_iolock = true;
+ int match;
+
+ ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+
+ if (!xfs_reflink_has_real_cow_blocks(ip)) {
+ trace_xfs_inode_free_cowblocks_invalid(ip);
+ xfs_inode_clear_cowblocks_tag(ip);
+ return 0;
+ }
+
+ /*
+ * If the mapping is dirty or under writeback we cannot touch the
+ * CoW fork. Leave it alone if we're in the midst of a directio.
+ */
+ if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
+ atomic_read(&VFS_I(ip)->i_dio_count))
+ return 0;
+
+ if (eofb) {
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+ match = xfs_inode_match_id_union(ip, eofb);
+ else
+ match = xfs_inode_match_id(ip, eofb);
+ if (!match)
+ return 0;
+
+ /* skip the inode if the file size is too small */
+ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return 0;
+
+ /*
+ * A scan owner implies we already hold the iolock. Skip it in
+ * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+ * the possibility of EAGAIN being returned.
+ */
+ if (eofb->eof_scan_owner == ip->i_ino)
+ need_iolock = false;
+ }
+
+ /* Free the CoW blocks */
+ if (need_iolock) {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ }
+
+ ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+
+ if (need_iolock) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ }
+
+ return ret;
+}
+
+int
+xfs_icache_free_cowblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
+ XFS_ICI_COWBLOCKS_TAG);
+}
+
+int
+xfs_inode_free_quota_cowblocks(
+ struct xfs_inode *ip)
+{
+ return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
+}
+
+void
+xfs_inode_set_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_set_eofblocks_tag(ip);
+ return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
+ trace_xfs_perag_set_eofblocks,
+ XFS_ICI_COWBLOCKS_TAG);
+}
+
+void
+xfs_inode_clear_cowblocks_tag(
+ xfs_inode_t *ip)
+{
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+ return __xfs_inode_clear_eofblocks_tag(ip,
+ trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG);
+}