summaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/Makefile18
-rw-r--r--fs/xfs/kmem.c5
-rw-r--r--fs/xfs/kmem.h8
-rw-r--r--fs/xfs/libxfs/xfs_ag.c160
-rw-r--r--fs/xfs/libxfs/xfs_ag.h2
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c10
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c252
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c77
-rw-r--r--fs/xfs/libxfs/xfs_attr.c57
-rw-r--r--fs/xfs/libxfs/xfs_attr.h12
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c36
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c22
-rw-r--r--fs/xfs/libxfs/xfs_bit.c1
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c353
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h16
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_btree.c49
-rw-r--r--fs/xfs/libxfs/xfs_btree.h14
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c59
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c3
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h3
-rw-r--r--fs/xfs/libxfs/xfs_defer.c16
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c23
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h1
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c21
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c26
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c148
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c38
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c5
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c18
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_fs.h263
-rw-r--r--fs/xfs/libxfs/xfs_health.h190
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c248
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h18
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c94
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c19
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c20
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h2
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c2
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h2
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c2
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c16
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c7
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c8
-rw-r--r--fs/xfs/libxfs/xfs_sb.c64
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_shared.h53
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c13
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c (renamed from fs/xfs/xfs_trans_inode.c)7
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c23
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h7
-rw-r--r--fs/xfs/libxfs/xfs_types.c39
-rw-r--r--fs/xfs/libxfs/xfs_types.h5
-rw-r--r--fs/xfs/scrub/agheader.c41
-rw-r--r--fs/xfs/scrub/agheader_repair.c17
-rw-r--r--fs/xfs/scrub/alloc.c7
-rw-r--r--fs/xfs/scrub/attr.c131
-rw-r--r--fs/xfs/scrub/attr.h71
-rw-r--r--fs/xfs/scrub/bitmap.c5
-rw-r--r--fs/xfs/scrub/bmap.c35
-rw-r--r--fs/xfs/scrub/btree.c18
-rw-r--r--fs/xfs/scrub/common.c55
-rw-r--r--fs/xfs/scrub/common.h4
-rw-r--r--fs/xfs/scrub/dabtree.c13
-rw-r--r--fs/xfs/scrub/dir.c16
-rw-r--r--fs/xfs/scrub/fscounters.c354
-rw-r--r--fs/xfs/scrub/health.c229
-rw-r--r--fs/xfs/scrub/health.h14
-rw-r--r--fs/xfs/scrub/ialloc.c343
-rw-r--r--fs/xfs/scrub/inode.c10
-rw-r--r--fs/xfs/scrub/parent.c10
-rw-r--r--fs/xfs/scrub/quota.c15
-rw-r--r--fs/xfs/scrub/refcount.c10
-rw-r--r--fs/xfs/scrub/repair.c51
-rw-r--r--fs/xfs/scrub/repair.h8
-rw-r--r--fs/xfs/scrub/rmap.c9
-rw-r--r--fs/xfs/scrub/rtbitmap.c12
-rw-r--r--fs/xfs/scrub/scrub.c69
-rw-r--r--fs/xfs/scrub/scrub.h27
-rw-r--r--fs/xfs/scrub/symlink.c8
-rw-r--r--fs/xfs/scrub/trace.c6
-rw-r--r--fs/xfs/scrub/trace.h108
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_aops.c503
-rw-r--r--fs/xfs/xfs_aops.h26
-rw-r--r--fs/xfs/xfs_attr_inactive.c7
-rw-r--r--fs/xfs/xfs_attr_list.c8
-rw-r--r--fs/xfs/xfs_bio_io.c61
-rw-r--r--fs/xfs/xfs_bmap_item.c350
-rw-r--r--fs/xfs/xfs_bmap_item.h2
-rw-r--r--fs/xfs/xfs_bmap_util.c22
-rw-r--r--fs/xfs/xfs_buf.c239
-rw-r--r--fs/xfs/xfs_buf.h61
-rw-r--r--fs/xfs/xfs_buf_item.c44
-rw-r--r--fs/xfs/xfs_buf_item.h6
-rw-r--r--fs/xfs/xfs_dir2_readdir.c5
-rw-r--r--fs/xfs/xfs_discard.c15
-rw-r--r--fs/xfs/xfs_dquot.c23
-rw-r--r--fs/xfs/xfs_dquot.h1
-rw-r--r--fs/xfs/xfs_dquot_item.c118
-rw-r--r--fs/xfs/xfs_dquot_item.h4
-rw-r--r--fs/xfs/xfs_error.c9
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_export.c4
-rw-r--r--fs/xfs/xfs_extfree_item.c410
-rw-r--r--fs/xfs/xfs_extfree_item.h6
-rw-r--r--fs/xfs/xfs_file.c106
-rw-r--r--fs/xfs/xfs_filestream.c5
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_fsops.c12
-rw-r--r--fs/xfs/xfs_fsops.h2
-rw-r--r--fs/xfs/xfs_globals.c6
-rw-r--r--fs/xfs/xfs_health.c388
-rw-r--r--fs/xfs/xfs_icache.c15
-rw-r--r--fs/xfs/xfs_icache.h4
-rw-r--r--fs/xfs/xfs_icreate_item.c75
-rw-r--r--fs/xfs/xfs_inode.c842
-rw-r--r--fs/xfs/xfs_inode.h20
-rw-r--r--fs/xfs/xfs_inode_item.c16
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_ioctl.c504
-rw-r--r--fs/xfs/xfs_ioctl.h8
-rw-r--r--fs/xfs/xfs_ioctl32.c170
-rw-r--r--fs/xfs/xfs_ioctl32.h14
-rw-r--r--fs/xfs/xfs_iomap.c523
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c31
-rw-r--r--fs/xfs/xfs_itable.c751
-rw-r--r--fs/xfs/xfs_itable.h106
-rw-r--r--fs/xfs/xfs_iwalk.c720
-rw-r--r--fs/xfs/xfs_iwalk.h46
-rw-r--r--fs/xfs/xfs_linux.h5
-rw-r--r--fs/xfs/xfs_log.c658
-rw-r--r--fs/xfs/xfs_log.h17
-rw-r--r--fs/xfs/xfs_log_cil.c72
-rw-r--r--fs/xfs/xfs_log_priv.h36
-rw-r--r--fs/xfs/xfs_log_recover.c487
-rw-r--r--fs/xfs/xfs_message.c2
-rw-r--r--fs/xfs/xfs_mount.c142
-rw-r--r--fs/xfs/xfs_mount.h64
-rw-r--r--fs/xfs/xfs_ondisk.h26
-rw-r--r--fs/xfs/xfs_pnfs.c11
-rw-r--r--fs/xfs/xfs_pwork.c136
-rw-r--r--fs/xfs/xfs_pwork.h61
-rw-r--r--fs/xfs/xfs_qm.c71
-rw-r--r--fs/xfs/xfs_qm.h8
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c5
-rw-r--r--fs/xfs/xfs_quota.h37
-rw-r--r--fs/xfs/xfs_quotaops.c3
-rw-r--r--fs/xfs/xfs_refcount_item.c357
-rw-r--r--fs/xfs/xfs_refcount_item.h2
-rw-r--r--fs/xfs/xfs_reflink.c165
-rw-r--r--fs/xfs/xfs_reflink.h18
-rw-r--r--fs/xfs/xfs_rmap_item.c380
-rw-r--r--fs/xfs/xfs_rmap_item.h2
-rw-r--r--fs/xfs/xfs_rtalloc.c6
-rw-r--r--fs/xfs/xfs_stats.c1
-rw-r--r--fs/xfs/xfs_super.c105
-rw-r--r--fs/xfs/xfs_super.h14
-rw-r--r--fs/xfs/xfs_symlink.c9
-rw-r--r--fs/xfs/xfs_sysctl.c3
-rw-r--r--fs/xfs/xfs_sysctl.h4
-rw-r--r--fs/xfs/xfs_sysfs.c66
-rw-r--r--fs/xfs/xfs_trace.c8
-rw-r--r--fs/xfs/xfs_trace.h252
-rw-r--r--fs/xfs/xfs_trans.c43
-rw-r--r--fs/xfs/xfs_trans.h70
-rw-r--r--fs/xfs/xfs_trans_ail.c53
-rw-r--r--fs/xfs/xfs_trans_bmap.c233
-rw-r--r--fs/xfs/xfs_trans_buf.c13
-rw-r--r--fs/xfs/xfs_trans_dquot.c63
-rw-r--r--fs/xfs/xfs_trans_extfree.c287
-rw-r--r--fs/xfs/xfs_trans_priv.h4
-rw-r--r--fs/xfs/xfs_trans_refcount.c241
-rw-r--r--fs/xfs/xfs_trans_rmap.c258
-rw-r--r--fs/xfs/xfs_xattr.c8
183 files changed, 8758 insertions, 6360 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 457ac9f97377..e685299eb3d2 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,7 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
- depends on (64BIT || LBDAF)
select EXPORTFS
select LIBCRC32C
select FS_IOMAP
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7f96bdadc372..06b68b6115bc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -4,8 +4,8 @@
# All Rights Reserved.
#
-ccflags-y += -I$(src) # needed for trace events
-ccflags-y += -I$(src)/libxfs
+ccflags-y += -I $(srctree)/$(src) # needed for trace events
+ccflags-y += -I $(srctree)/$(src)/libxfs
ccflags-$(CONFIG_XFS_DEBUG) += -g
@@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \
xfs_refcount_btree.o \
xfs_sb.o \
xfs_symlink_remote.o \
+ xfs_trans_inode.o \
xfs_trans_resv.o \
xfs_types.o \
)
@@ -62,6 +63,7 @@ xfs-y += xfs_aops.o \
xfs_attr_inactive.o \
xfs_attr_list.o \
xfs_bmap_util.o \
+ xfs_bio_io.o \
xfs_buf.o \
xfs_dir2_readdir.o \
xfs_discard.o \
@@ -73,15 +75,18 @@ xfs-y += xfs_aops.o \
xfs_fsmap.o \
xfs_fsops.o \
xfs_globals.o \
+ xfs_health.o \
xfs_icache.o \
xfs_ioctl.o \
xfs_iomap.o \
xfs_iops.o \
xfs_inode.o \
xfs_itable.o \
+ xfs_iwalk.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_pwork.o \
xfs_reflink.o \
xfs_stats.o \
xfs_super.o \
@@ -103,12 +108,7 @@ xfs-y += xfs_log.o \
xfs_rmap_item.o \
xfs_log_recover.o \
xfs_trans_ail.o \
- xfs_trans_bmap.o \
- xfs_trans_buf.o \
- xfs_trans_extfree.o \
- xfs_trans_inode.o \
- xfs_trans_refcount.o \
- xfs_trans_rmap.o \
+ xfs_trans_buf.o
# optional features
xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
@@ -142,6 +142,8 @@ xfs-y += $(addprefix scrub/, \
common.o \
dabtree.o \
dir.o \
+ fscounters.o \
+ health.o \
ialloc.o \
inode.o \
parent.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index fdd9d6ede25c..16bb9a328678 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -3,12 +3,7 @@
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/mm.h>
#include <linux/sched/mm.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include "kmem.h"
#include "xfs_message.h"
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 8e6b3ba81c03..267655acd426 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -124,4 +124,12 @@ kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
return kmem_zone_alloc(zone, flags | KM_ZERO);
}
+static inline struct page *
+kmem_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 999ad8d00d43..5de296b34ab1 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -10,6 +10,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
@@ -19,6 +20,8 @@
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_health.h"
static struct xfs_buf *
xfs_get_aghdr_buf(
@@ -42,6 +45,12 @@ xfs_get_aghdr_buf(
return bp;
}
+static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id)
+{
+ return mp->m_sb.sb_logstart > 0 &&
+ id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart);
+}
+
/*
* Generic btree root block init function
*/
@@ -51,40 +60,85 @@ xfs_btroot_init(
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+ xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
}
-/*
- * Alloc btree root block init functions
- */
+/* Finish initializing a free space btree. */
static void
-xfs_bnoroot_init(
+xfs_freesp_init_recs(
struct xfs_mount *mp,
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
struct xfs_alloc_rec *arec;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+
+ if (is_log_ag(mp, id)) {
+ struct xfs_alloc_rec *nrec;
+ xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp,
+ mp->m_sb.sb_logstart);
+
+ ASSERT(start >= mp->m_ag_prealloc_blocks);
+ if (start != mp->m_ag_prealloc_blocks) {
+ /*
+ * Modify first record to pad stripe align of log
+ */
+ arec->ar_blockcount = cpu_to_be32(start -
+ mp->m_ag_prealloc_blocks);
+ nrec = arec + 1;
+
+ /*
+ * Insert second record at start of internal log
+ * which then gets trimmed.
+ */
+ nrec->ar_startblock = cpu_to_be32(
+ be32_to_cpu(arec->ar_startblock) +
+ be32_to_cpu(arec->ar_blockcount));
+ arec = nrec;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+ /*
+ * Change record start to after the internal log
+ */
+ be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks);
+ }
+
+ /*
+ * Calculate the record block count and check for the case where
+ * the log might have consumed all available space in the AG. If
+ * so, reset the record count to 0 to avoid exposure of an invalid
+ * record start block.
+ */
arec->ar_blockcount = cpu_to_be32(id->agsize -
be32_to_cpu(arec->ar_startblock));
+ if (!arec->ar_blockcount)
+ block->bb_numrecs = 0;
}
+/*
+ * Alloc btree root block init functions
+ */
static void
-xfs_cntroot_init(
+xfs_bnoroot_init(
struct xfs_mount *mp,
struct xfs_buf *bp,
struct aghdr_init_data *id)
{
- struct xfs_alloc_rec *arec;
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+}
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(id->agsize -
- be32_to_cpu(arec->ar_startblock));
+static void
+xfs_cntroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
}
/*
@@ -99,7 +153,7 @@ xfs_rmaproot_init(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_rmap_rec *rrec;
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
/*
* mark the AG header regions as static metadata The BNO
@@ -147,6 +201,18 @@ xfs_rmaproot_init(
rrec->rm_offset = 0;
be16_add_cpu(&block->bb_numrecs, 1);
}
+
+ /* account for the log space */
+ if (is_log_ag(mp, id)) {
+ rrec = XFS_RMAP_REC_ADDR(block,
+ be16_to_cpu(block->bb_numrecs) + 1);
+ rrec->rm_startblock = cpu_to_be32(
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart));
+ rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1);
+ }
}
/*
@@ -207,6 +273,14 @@ xfs_agfblock_init(
agf->agf_refcount_level = cpu_to_be32(1);
agf->agf_refcount_blocks = cpu_to_be32(1);
}
+
+ if (is_log_ag(mp, id)) {
+ int64_t logblocks = mp->m_sb.sb_logblocks;
+
+ be32_add_cpu(&agf->agf_freeblks, -logblocks);
+ agf->agf_longest = cpu_to_be32(id->agsize -
+ XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks);
+ }
}
static void
@@ -339,14 +413,14 @@ xfs_ag_init_headers(
{ /* BNO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_bnobt_buf_ops,
.work = &xfs_bnoroot_init,
.need_init = true
},
{ /* CNT root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_allocbt_buf_ops,
+ .ops = &xfs_cntbt_buf_ops,
.work = &xfs_cntroot_init,
.need_init = true
},
@@ -361,7 +435,7 @@ xfs_ag_init_headers(
{ /* FINO root block */
.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
.numblks = BTOBB(mp->m_sb.sb_blocksize),
- .ops = &xfs_inobt_buf_ops,
+ .ops = &xfs_finobt_buf_ops,
.work = &xfs_btroot_init,
.type = XFS_BTNUM_FINO,
.need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
@@ -461,3 +535,55 @@ xfs_ag_extend_space(
len, &XFS_RMAP_OINFO_SKIP_UPDATE,
XFS_AG_RESV_NONE);
}
+
+/* Retrieve AG geometry. */
+int
+xfs_ag_get_geometry(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ struct xfs_ag_geometry *ageo)
+{
+ struct xfs_buf *agi_bp;
+ struct xfs_buf *agf_bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ struct xfs_perag *pag;
+ unsigned int freeblks;
+ int error;
+
+ if (agno >= mp->m_sb.sb_agcount)
+ return -EINVAL;
+
+ /* Lock the AG headers. */
+ error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp);
+ if (error)
+ return error;
+ error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
+ if (error)
+ goto out_agi;
+ pag = xfs_perag_get(mp, agno);
+
+ /* Fill out form. */
+ memset(ageo, 0, sizeof(*ageo));
+ ageo->ag_number = agno;
+
+ agi = XFS_BUF_TO_AGI(agi_bp);
+ ageo->ag_icount = be32_to_cpu(agi->agi_count);
+ ageo->ag_ifree = be32_to_cpu(agi->agi_freecount);
+
+ agf = XFS_BUF_TO_AGF(agf_bp);
+ ageo->ag_length = be32_to_cpu(agf->agf_length);
+ freeblks = pag->pagf_freeblks +
+ pag->pagf_flcount +
+ pag->pagf_btreeblks -
+ xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE);
+ ageo->ag_freeblks = freeblks;
+ xfs_ag_geom_health(pag, ageo);
+
+ /* Release resources. */
+ xfs_perag_put(pag);
+ xfs_buf_relse(agf_bp);
+out_agi:
+ xfs_buf_relse(agi_bp);
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 412702e23f61..5166322807e7 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -26,5 +26,7 @@ struct aghdr_init_data {
int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
struct aghdr_init_data *id, xfs_extlen_t len);
+int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_ag_geometry *ageo);
#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e701ebc36c06..87a9747f1d36 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -9,20 +9,12 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
-#include "xfs_bit.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ag_resv.h"
-#include "xfs_trans_space.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
@@ -281,7 +273,7 @@ xfs_ag_resv_init(
*/
ask = used = 0;
- mp->m_inotbt_nores = true;
+ mp->m_finobt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask,
&used);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b715668886a4..372ad55631fc 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -13,7 +13,6 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
@@ -21,7 +20,6 @@
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -41,8 +39,6 @@ struct workqueue_struct *xfs_alloc_wq;
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
-STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
- xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
/*
* Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
@@ -555,7 +551,7 @@ static xfs_failaddr_t
xfs_agfl_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
int i;
@@ -568,9 +564,9 @@ xfs_agfl_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return NULL;
- if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
+ if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
return __this_address;
- if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC)
+ if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
/*
* during growfs operations, the perag is not fully initialised,
@@ -596,7 +592,7 @@ static void
xfs_agfl_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
/*
@@ -621,7 +617,7 @@ static void
xfs_agfl_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -643,6 +639,7 @@ xfs_agfl_write_verify(
const struct xfs_buf_ops xfs_agfl_buf_ops = {
.name = "xfs_agfl",
+ .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
.verify_read = xfs_agfl_read_verify,
.verify_write = xfs_agfl_write_verify,
.verify_struct = xfs_agfl_verify,
@@ -699,6 +696,107 @@ xfs_alloc_update_counters(
*/
/*
+ * Deal with the case where only small freespaces remain. Either return the
+ * contents of the last freespace record, or allocate space from the freelist if
+ * there is nothing in the tree.
+ */
+STATIC int /* error */
+xfs_alloc_ag_vextent_small(
+ struct xfs_alloc_arg *args, /* allocation argument structure */
+ struct xfs_btree_cur *ccur, /* optional by-size cursor */
+ xfs_agblock_t *fbnop, /* result block number */
+ xfs_extlen_t *flenp, /* result length */
+ int *stat) /* status: 0-freelist, 1-normal/none */
+{
+ int error = 0;
+ xfs_agblock_t fbno = NULLAGBLOCK;
+ xfs_extlen_t flen = 0;
+ int i = 0;
+
+ /*
+ * If a cntbt cursor is provided, try to allocate the largest record in
+ * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
+ * allocation. Make sure to respect minleft even when pulling from the
+ * freelist.
+ */
+ if (ccur)
+ error = xfs_btree_decrement(ccur, 0, &i);
+ if (error)
+ goto error;
+ if (i) {
+ error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
+ goto out;
+ }
+
+ if (args->minlen != 1 || args->alignment != 1 ||
+ args->resv == XFS_AG_RESV_AGFL ||
+ (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
+ args->minleft))
+ goto out;
+
+ error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
+ if (error)
+ goto error;
+ if (fbno == NULLAGBLOCK)
+ goto out;
+
+ xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
+ xfs_alloc_allow_busy_reuse(args->datatype));
+
+ if (xfs_alloc_is_userdata(args->datatype)) {
+ struct xfs_buf *bp;
+
+ bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
+ if (!bp) {
+ error = -EFSCORRUPTED;
+ goto error;
+ }
+ xfs_trans_binval(args->tp, bp);
+ }
+ *fbnop = args->agbno = fbno;
+ *flenp = args->len = 1;
+ XFS_WANT_CORRUPTED_GOTO(args->mp,
+ fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
+ error);
+ args->wasfromfl = 1;
+ trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that doesn't live in the
+ * free space, we need to clear out the OWN_AG rmap.
+ */
+ error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ goto error;
+
+ *stat = 0;
+ return 0;
+
+out:
+ /*
+ * Can't do the allocation, give up.
+ */
+ if (flen < args->minlen) {
+ args->agbno = NULLAGBLOCK;
+ trace_xfs_alloc_small_notenough(args);
+ flen = 0;
+ }
+ *fbnop = fbno;
+ *flenp = flen;
+ *stat = 1;
+ trace_xfs_alloc_small_done(args);
+ return 0;
+
+error:
+ trace_xfs_alloc_small_error(args);
+ return error;
+}
+
+/*
* Allocate a variable extent in the allocation group agno.
* Type and bno are used to determine where in the allocation group the
* extent will start.
@@ -1582,112 +1680,6 @@ out_nominleft:
}
/*
- * Deal with the case where only small freespaces remain.
- * Either return the contents of the last freespace record,
- * or allocate space from the freelist if there is nothing in the tree.
- */
-STATIC int /* error */
-xfs_alloc_ag_vextent_small(
- xfs_alloc_arg_t *args, /* allocation argument structure */
- xfs_btree_cur_t *ccur, /* by-size cursor */
- xfs_agblock_t *fbnop, /* result block number */
- xfs_extlen_t *flenp, /* result length */
- int *stat) /* status: 0-freelist, 1-normal/none */
-{
- int error;
- xfs_agblock_t fbno;
- xfs_extlen_t flen;
- int i;
-
- if ((error = xfs_btree_decrement(ccur, 0, &i)))
- goto error0;
- if (i) {
- if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
- }
- /*
- * Nothing in the btree, try the freelist. Make sure
- * to respect minleft even when pulling from the
- * freelist.
- */
- else if (args->minlen == 1 && args->alignment == 1 &&
- args->resv != XFS_AG_RESV_AGFL &&
- (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
- > args->minleft)) {
- error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
- if (error)
- goto error0;
- if (fbno != NULLAGBLOCK) {
- xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
- xfs_alloc_allow_busy_reuse(args->datatype));
-
- if (xfs_alloc_is_userdata(args->datatype)) {
- xfs_buf_t *bp;
-
- bp = xfs_btree_get_bufs(args->mp, args->tp,
- args->agno, fbno, 0);
- if (!bp) {
- error = -EFSCORRUPTED;
- goto error0;
- }
- xfs_trans_binval(args->tp, bp);
- }
- args->len = 1;
- args->agbno = fbno;
- XFS_WANT_CORRUPTED_GOTO(args->mp,
- args->agbno + args->len <=
- be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
- error0);
- args->wasfromfl = 1;
- trace_xfs_alloc_small_freelist(args);
-
- /*
- * If we're feeding an AGFL block to something that
- * doesn't live in the free space, we need to clear
- * out the OWN_AG rmap.
- */
- error = xfs_rmap_free(args->tp, args->agbp, args->agno,
- fbno, 1, &XFS_RMAP_OINFO_AG);
- if (error)
- goto error0;
-
- *stat = 0;
- return 0;
- }
- /*
- * Nothing in the freelist.
- */
- else
- flen = 0;
- }
- /*
- * Can't allocate from the freelist for some reason.
- */
- else {
- fbno = NULLAGBLOCK;
- flen = 0;
- }
- /*
- * Can't do the allocation, give up.
- */
- if (flen < args->minlen) {
- args->agbno = NULLAGBLOCK;
- trace_xfs_alloc_small_notenough(args);
- flen = 0;
- }
- *fbnop = fbno;
- *flenp = flen;
- *stat = 1;
- trace_xfs_alloc_small_done(args);
- return 0;
-
-error0:
- trace_xfs_alloc_small_error(args);
- return error;
-}
-
-/*
* Free the extent starting at agno/bno for length.
*/
STATIC int
@@ -2041,6 +2033,7 @@ xfs_alloc_space_available(
xfs_extlen_t alloc_len, longest;
xfs_extlen_t reservation; /* blocks that are still reserved */
int available;
+ xfs_extlen_t agflcount;
if (flags & XFS_ALLOC_FLAG_FREEING)
return true;
@@ -2053,8 +2046,13 @@ xfs_alloc_space_available(
if (longest < alloc_len)
return false;
- /* do we have enough free space remaining for the allocation? */
- available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
+ /*
+ * Do we have enough free space remaining for the allocation? Don't
+ * account extra agfl blocks because we are about to defer free them,
+ * making them unavailable until the current transaction commits.
+ */
+ agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
+ available = (int)(pag->pagf_freeblks + agflcount -
reservation - min_free - args->minleft);
if (available < (int)max(args->total, alloc_len))
return false;
@@ -2088,7 +2086,7 @@ xfs_free_agfl_block(
if (error)
return error;
- bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0);
+ bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
if (!bp)
return -EFSCORRUPTED;
xfs_trans_binval(tp, bp);
@@ -2236,6 +2234,9 @@ xfs_alloc_fix_freelist(
xfs_extlen_t need; /* total blocks needed in freelist */
int error = 0;
+ /* deferred ops (AGFL block frees) require permanent transactions */
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
if (!pag->pagf_init) {
error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
if (error)
@@ -2576,7 +2577,7 @@ static xfs_failaddr_t
xfs_agf_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -2587,8 +2588,10 @@ xfs_agf_verify(
return __this_address;
}
- if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ if (!xfs_verify_magic(bp, agf->agf_magicnum))
+ return __this_address;
+
+ if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
@@ -2632,7 +2635,7 @@ static void
xfs_agf_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -2649,7 +2652,7 @@ static void
xfs_agf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -2670,6 +2673,7 @@ xfs_agf_write_verify(
const struct xfs_buf_ops xfs_agf_buf_ops = {
.name = "xfs_agf",
+ .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
.verify_read = xfs_agf_read_verify,
.verify_write = xfs_agf_write_verify,
.verify_struct = xfs_agf_verify,
@@ -3133,7 +3137,7 @@ xfs_alloc_has_record(
/*
* Walk all the blocks in the AGFL. The @walk_fn can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ * error code or XFS_ITER_*.
*/
int
xfs_agfl_walk(
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 4e59cc8a2802..2a94543857a1 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -17,7 +17,6 @@
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
@@ -292,53 +291,39 @@ static xfs_failaddr_t
xfs_allocbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
+ xfs_btnum_t btnum = XFS_BTNUM_BNOi;
+
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ fa = xfs_btree_sblock_v5hdr_verify(bp);
+ if (fa)
+ return fa;
+ }
/*
- * magic number and level verification
- *
- * During growfs operations, we can't verify the exact level or owner as
- * the perag is not fully initialised and hence not attached to the
- * buffer. In this case, check against the maximum tree depth.
+ * The perag may not be attached during grow operations or fully
+ * initialized from the AGF during log recovery. Therefore we can only
+ * check against maximum tree depth from those contexts.
*
- * Similarly, during log recovery we will have a perag structure
- * attached, but the agf information will not yet have been initialised
- * from the on disk AGF. Again, we can only check against maximum limits
- * in this case.
+ * Otherwise check against the per-tree limit. Peek at one of the
+ * verifier magic values to determine the type of tree we're verifying
+ * against.
*/
level = be16_to_cpu(block->bb_level);
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTB_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_BNOi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
+ if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
+ btnum = XFS_BTNUM_CNTi;
+ if (pag && pag->pagf_init) {
+ if (level >= pag->pagf_levels[btnum])
return __this_address;
- break;
- case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
- fa = xfs_btree_sblock_v5hdr_verify(bp);
- if (fa)
- return fa;
- /* fall through */
- case cpu_to_be32(XFS_ABTC_MAGIC):
- if (pag && pag->pagf_init) {
- if (level >= pag->pagf_levels[XFS_BTNUM_CNTi])
- return __this_address;
- } else if (level >= mp->m_ag_maxlevels)
- return __this_address;
- break;
- default:
+ } else if (level >= mp->m_ag_maxlevels)
return __this_address;
- }
return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
}
@@ -377,13 +362,23 @@ xfs_allocbt_write_verify(
}
-const struct xfs_buf_ops xfs_allocbt_buf_ops = {
- .name = "xfs_allocbt",
+const struct xfs_buf_ops xfs_bnobt_buf_ops = {
+ .name = "xfs_bnobt",
+ .magic = { cpu_to_be32(XFS_ABTB_MAGIC),
+ cpu_to_be32(XFS_ABTB_CRC_MAGIC) },
.verify_read = xfs_allocbt_read_verify,
.verify_write = xfs_allocbt_write_verify,
.verify_struct = xfs_allocbt_verify,
};
+const struct xfs_buf_ops xfs_cntbt_buf_ops = {
+ .name = "xfs_cntbt",
+ .magic = { cpu_to_be32(XFS_ABTC_MAGIC),
+ cpu_to_be32(XFS_ABTC_CRC_MAGIC) },
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+ .verify_struct = xfs_allocbt_verify,
+};
STATIC int
xfs_bnobt_keys_inorder(
@@ -448,7 +443,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_bnobt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_bnobt_buf_ops,
.diff_two_keys = xfs_bnobt_diff_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
@@ -470,7 +465,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_cntbt_key_diff,
- .buf_ops = &xfs_allocbt_buf_ops,
+ .buf_ops = &xfs_cntbt_buf_ops,
.diff_two_keys = xfs_cntbt_diff_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 844ed87b1900..d48fcf11cc35 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -9,23 +9,18 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr_remote.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
@@ -224,10 +219,10 @@ xfs_attr_try_sf_addname(
*/
int
xfs_attr_set_args(
- struct xfs_da_args *args,
- struct xfs_buf **leaf_bp)
+ struct xfs_da_args *args)
{
struct xfs_inode *dp = args->dp;
+ struct xfs_buf *leaf_bp = NULL;
int error;
/*
@@ -255,7 +250,7 @@ xfs_attr_set_args(
* It won't fit in the shortform, transform to a leaf block.
* GROT: another possible req'mt for a double-split btree op.
*/
- error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+ error = xfs_attr_shortform_to_leaf(args, &leaf_bp);
if (error)
return error;
@@ -263,23 +258,16 @@ xfs_attr_set_args(
* Prevent the leaf buffer from being unlocked so that a
* concurrent AIL push cannot grab the half-baked leaf
* buffer and run into problems with the write verifier.
+ * Once we're done rolling the transaction we can release
+ * the hold and add the attr to the leaf.
*/
- xfs_trans_bhold(args->trans, *leaf_bp);
-
+ xfs_trans_bhold(args->trans, leaf_bp);
error = xfs_defer_finish(&args->trans);
- if (error)
- return error;
-
- /*
- * Commit the leaf transformation. We'll need another
- * (linked) transaction to add the new attribute to the
- * leaf.
- */
- error = xfs_trans_roll_inode(&args->trans, dp);
- if (error)
+ xfs_trans_bhold_release(args->trans, leaf_bp);
+ if (error) {
+ xfs_trans_brelse(args->trans, leaf_bp);
return error;
- xfs_trans_bjoin(args->trans, *leaf_bp);
- *leaf_bp = NULL;
+ }
}
if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
@@ -322,7 +310,6 @@ xfs_attr_set(
int flags)
{
struct xfs_mount *mp = dp->i_mount;
- struct xfs_buf *leaf_bp = NULL;
struct xfs_da_args args;
struct xfs_trans_res tres;
int rsvd = (flags & ATTR_ROOT) != 0;
@@ -381,9 +368,9 @@ xfs_attr_set(
goto out_trans_cancel;
xfs_trans_ijoin(args.trans, dp, 0);
- error = xfs_attr_set_args(&args, &leaf_bp);
+ error = xfs_attr_set_args(&args);
if (error)
- goto out_release_leaf;
+ goto out_trans_cancel;
if (!args.trans) {
/* shortform attribute has already been committed */
goto out_unlock;
@@ -408,9 +395,6 @@ out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
return error;
-out_release_leaf:
- if (leaf_bp)
- xfs_trans_brelse(args.trans, leaf_bp);
out_trans_cancel:
if (args.trans)
xfs_trans_cancel(args.trans);
@@ -1336,3 +1320,20 @@ xfs_attr_node_get(xfs_da_args_t *args)
xfs_da_state_free(state);
return retval;
}
+
+/*
+ * Returns true if the attribute entry name is valid.
+ *
+ * @name:   buffer holding the candidate name. Only the first @length bytes
+ *          are examined, so a terminating null is not required.
+ * @length: number of bytes of @name to check.
+ *
+ * The name is rejected (false) when @length is MAXNAMELEN or larger, or when
+ * any of the first @length bytes is zero. A @length of zero passes both
+ * checks, because memchr() over zero bytes finds nothing.
+ */
+bool
+xfs_attr_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any nulls here */
+ return !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index bdf52a333f3f..ff28ebf3b635 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -112,7 +112,13 @@ typedef struct xfs_attr_list_context {
struct xfs_inode *dp; /* inode */
struct attrlist_cursor_kern *cursor; /* position in list */
char *alist; /* output buffer */
- int seen_enough; /* T/F: seen enough of list? */
+
+ /*
+ * Abort attribute list iteration if non-zero. Can be used to pass
+ * error values to the xfs_attr_list caller.
+ */
+ int seen_enough;
+
ssize_t count; /* num used entries */
int dupcnt; /* count dup hashvals seen */
int bufsize; /* total buffer size */
@@ -140,11 +146,11 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
unsigned char *value, int *valuelenp, int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
unsigned char *value, int valuelen, int flags);
-int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp);
+int xfs_attr_set_args(struct xfs_da_args *args);
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-
+bool xfs_attr_namecheck(const void *name, size_t length);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2652d00842d6..70eb941d02e4 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -10,14 +10,12 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_attr_sf.h"
@@ -27,7 +25,6 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_dir2.h"
#include "xfs_log.h"
@@ -240,30 +237,19 @@ xfs_attr3_leaf_verify(
struct xfs_buf *bp)
{
struct xfs_attr3_icleaf_hdr ichdr;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *entries;
uint32_t end; /* must be 32bit - see below */
int i;
+ xfs_failaddr_t fa;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
- return __this_address;
- }
/*
* In recovery there is a transient state where count == 0 is valid
* because we may have transitioned an empty shortform attr to a leaf
@@ -324,7 +310,7 @@ static void
xfs_attr3_leaf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -354,7 +340,7 @@ static void
xfs_attr3_leaf_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -369,6 +355,8 @@ xfs_attr3_leaf_read_verify(
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
.name = "xfs_attr3_leaf",
+ .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC),
+ cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) },
.verify_read = xfs_attr3_leaf_read_verify,
.verify_write = xfs_attr3_leaf_write_verify,
.verify_struct = xfs_attr3_leaf_verify,
@@ -874,7 +862,7 @@ xfs_attr_shortform_allfit(
struct xfs_attr3_icleaf_hdr leafhdr;
int bytes;
int i;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
@@ -1534,7 +1522,7 @@ xfs_attr_leaf_order(
{
struct xfs_attr3_icleaf_hdr ichdr1;
struct xfs_attr3_icleaf_hdr ichdr2;
- struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
+ struct xfs_mount *mp = leaf1_bp->b_mount;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
@@ -2577,7 +2565,7 @@ xfs_attr_leaf_lasthash(
{
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entries;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
entries = xfs_attr3_leaf_entryp(bp->b_addr);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d89363c6b523..4eb30d357045 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -16,18 +16,10 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_trans_space.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
#include "xfs_error.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -79,6 +71,7 @@ xfs_attr3_rmt_hdr_ok(
static xfs_failaddr_t
xfs_attr3_rmt_verify(
struct xfs_mount *mp,
+ struct xfs_buf *bp,
void *ptr,
int fsbsize,
xfs_daddr_t bno)
@@ -87,7 +80,7 @@ xfs_attr3_rmt_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC))
+ if (!xfs_verify_magic(bp, rmt->rm_magic))
return __this_address;
if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -110,7 +103,7 @@ __xfs_attr3_rmt_read_verify(
bool check_crc,
xfs_failaddr_t *failaddr)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
char *ptr;
int len;
xfs_daddr_t bno;
@@ -131,7 +124,7 @@ __xfs_attr3_rmt_read_verify(
*failaddr = __this_address;
return -EFSBADCRC;
}
- *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (*failaddr)
return -EFSCORRUPTED;
len -= blksize;
@@ -174,7 +167,7 @@ static void
xfs_attr3_rmt_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
int blksize = mp->m_attr_geo->blksize;
char *ptr;
@@ -193,7 +186,7 @@ xfs_attr3_rmt_write_verify(
while (len > 0) {
struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr;
- fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno);
+ fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -220,6 +213,7 @@ xfs_attr3_rmt_write_verify(
const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
.name = "xfs_attr3_rmt",
+ .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) },
.verify_read = xfs_attr3_rmt_read_verify,
.verify_write = xfs_attr3_rmt_write_verify,
.verify_struct = xfs_attr3_rmt_verify_struct,
@@ -533,7 +527,7 @@ xfs_attr_rmtval_set(
dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
- bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
+ bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt);
if (!bp)
return -ENOMEM;
bp->b_ops = &xfs_attr3_rmt_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 40ce5f3094d1..7071ff98fdbc 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -5,7 +5,6 @@
*/
#include "xfs.h"
#include "xfs_log_format.h"
-#include "xfs_bit.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 332eefa2700b..baf0b72c0a37 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -13,14 +13,10 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -32,7 +28,6 @@
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
#include "xfs_trace.h"
-#include "xfs_symlink.h"
#include "xfs_attr_leaf.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
@@ -370,7 +365,7 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -454,7 +449,7 @@ xfs_bmap_check_leaf_extents(
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
if (!bp) {
bp_release = 1;
- error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -577,47 +572,49 @@ __xfs_bmap_add_free(
*/
/*
- * Transform a btree format file with only one leaf node, where the
- * extents list will fit in the inode, into an extents format file.
- * Since the file extents are already in-core, all we have to do is
- * give up the space for the btree root and pitch the leaf block.
+ * Convert the inode format to extent format if it currently is in btree format,
+ * but the extent list is small enough that it fits into the extent format.
+ *
+ * Since the extents are already in-core, all we have to do is give up the space
+ * for the btree root and pitch the leaf block.
*/
STATIC int /* error */
xfs_bmap_btree_to_extents(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* incore inode pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- /* REFERENCED */
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_btree_block *rblock = ifp->if_broot;
struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
- struct xfs_ifork *ifp; /* inode fork data */
- xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- struct xfs_btree_block *rblock;/* root btree block */
struct xfs_owner_info oinfo;
- mp = ip->i_mount;
- ifp = XFS_IFORK_PTR(ip, whichfork);
+ /* check if we actually need the extent format first: */
+ if (!xfs_bmap_wants_extents(ip, whichfork))
+ return 0;
+
+ ASSERT(cur);
ASSERT(whichfork != XFS_COW_FORK);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
- *logflagsp = 0;
#ifdef DEBUG
XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
xfs_btree_check_lptr(cur, cbno, 1));
#endif
- error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
return error;
@@ -635,7 +632,7 @@ xfs_bmap_btree_to_extents(
ASSERT(ifp->if_broot == NULL);
ASSERT((ifp->if_flags & XFS_IFBROOT) == 0);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
- *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
+ *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
return 0;
}
@@ -730,7 +727,7 @@ xfs_bmap_extents_to_btree(
cur->bc_private.b.allocated++;
ip->i_d.di_nblocks++;
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
- abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
+ abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
if (!abp) {
error = -EFSCORRUPTED;
goto out_unreserve_dquot;
@@ -876,7 +873,7 @@ xfs_bmap_local_to_extents(
ASSERT(args.fsbno != NULLFSBLOCK);
ASSERT(args.len == 1);
tp->t_firstblock = args.fsbno;
- bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+ bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno);
/*
* Initialize the block, copy the data and log the remote buffer.
@@ -1189,7 +1186,10 @@ xfs_iread_extents(
* Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
*/
level = be16_to_cpu(block->bb_level);
- ASSERT(level > 0);
+ if (unlikely(level == 0)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
@@ -1198,7 +1198,7 @@ xfs_iread_extents(
* pointer (leftmost) at each level.
*/
while (level-- > 0) {
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
goto out;
@@ -1271,7 +1271,7 @@ xfs_iread_extents(
*/
if (bno == NULLFSBLOCK)
break;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp,
XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
if (error)
goto out;
@@ -2004,6 +2004,9 @@ xfs_bmap_add_extent_delay_real(
goto done;
}
+ if (da_new != da_old)
+ xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
+
if (bma->cur) {
da_new += bma->cur->bc_private.b.allocated;
bma->cur->bc_private.b.allocated = 0;
@@ -2029,7 +2032,7 @@ done:
/*
* Convert an unwritten allocation to a real allocation or vice versa.
*/
-STATIC int /* error */
+int /* error */
xfs_bmap_add_extent_unwritten_real(
struct xfs_trans *tp,
xfs_inode_t *ip, /* incore inode pointer */
@@ -2635,6 +2638,7 @@ xfs_bmap_add_extent_hole_delay(
/*
* Nothing to do for disk quota accounting here.
*/
+ xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen);
}
}
@@ -3347,8 +3351,10 @@ xfs_bmap_btalloc_accounting(
* already have quota reservation and there's nothing to do
* yet.
*/
- if (ap->wasdel)
+ if (ap->wasdel) {
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
return;
+ }
/*
* Otherwise, we've allocated blocks in a hole. The transaction
@@ -3367,8 +3373,10 @@ xfs_bmap_btalloc_accounting(
/* data/attr fork only */
ap->ip->i_d.di_nblocks += args->len;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
+ if (ap->wasdel) {
ap->ip->i_delayed_blks -= args->len;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ }
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
args->len);
@@ -3685,17 +3693,6 @@ xfs_trim_extent(
}
}
-/* trim extent to within eof */
-void
-xfs_trim_extent_eof(
- struct xfs_bmbt_irec *irec,
- struct xfs_inode *ip)
-
-{
- xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount,
- i_size_read(VFS_I(ip))));
-}
-
/*
* Trim the returned map to the required bounds
*/
@@ -3975,6 +3972,7 @@ xfs_bmapi_reserve_delalloc(
ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip->i_mount, alen + indlen);
got->br_startoff = aoff;
got->br_startblock = nullstartblock(indlen);
@@ -4203,6 +4201,44 @@ xfs_bmapi_convert_unwritten(
return 0;
}
+/*
+ * Compute the value that callers store in bma.minleft before allocating.
+ *
+ * Returns 0 when a transaction is supplied and it already has a first block
+ * recorded (t_firstblock != NULLFSBLOCK), 1 when the fork is not in btree
+ * format, and otherwise the level of the fork's btree root block plus one.
+ */
+static inline xfs_extlen_t
+xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int fork)
+{
+ if (tp && tp->t_firstblock != NULLFSBLOCK)
+ return 0;
+ if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE)
+ return 1;
+ return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1;
+}
+
+/*
+ * Log whatever the flags say, even if error. Otherwise we might miss detecting
+ * a case where the data is changed, there's an error, and it's not logged so we
+ * don't shutdown when we should. Don't bother logging extents/btree changes if
+ * we converted to the other format.
+ *
+ * @bma:       allocation context; supplies the transaction, the inode, the
+ *             accumulated logflags and the (possibly NULL) btree cursor.
+ * @whichfork: fork whose extent-list / btree-root log flags are filtered.
+ * @error:     result of the operation so far; it is handed unchanged to
+ *             xfs_btree_del_cursor() when a cursor exists.
+ *
+ * The two flag filters form an if / else if chain, so the btree-root filter
+ * is only evaluated when the extent-list filter did not fire. The inode is
+ * logged only if some flag is still set after filtering.
+ */
+static void
+xfs_bmapi_finish(
+ struct xfs_bmalloca *bma,
+ int whichfork,
+ int error)
+{
+ if ((bma->logflags & xfs_ilog_fext(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
+ bma->logflags &= ~xfs_ilog_fext(whichfork);
+ else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) &&
+ XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE)
+ bma->logflags &= ~xfs_ilog_fbroot(whichfork);
+
+ if (bma->logflags)
+ xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags);
+ if (bma->cur)
+ xfs_btree_del_cursor(bma->cur, error);
+}
+
/*
* Map file blocks to filesystem blocks, and allocate blocks or convert the
* extent state if necessary. Details behaviour is controlled by the flags
@@ -4220,9 +4256,13 @@ xfs_bmapi_write(
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
{
+ struct xfs_bmalloca bma = {
+ .tp = tp,
+ .ip = ip,
+ .total = total,
+ };
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
- struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */
xfs_fileoff_t end; /* end of mapped file region */
bool eof = false; /* after the end of extents */
int error; /* error return */
@@ -4247,9 +4287,7 @@ xfs_bmapi_write(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
- ASSERT(tp != NULL ||
- (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
- (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
+ ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -4282,34 +4320,21 @@ xfs_bmapi_write(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!tp || tp->t_firstblock == NULLFSBLOCK) {
- if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
- bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- else
- bma.minleft = 1;
- } else {
- bma.minleft = 0;
- }
-
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
goto error0;
}
- n = 0;
- end = bno + len;
- obno = bno;
-
if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got))
eof = true;
if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
bma.prev.br_startoff = NULLFILEOFF;
- bma.tp = tp;
- bma.ip = ip;
- bma.total = total;
- bma.datatype = 0;
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ n = 0;
+ end = bno + len;
+ obno = bno;
while (bno < end && n < *nmap) {
bool need_alloc = false, wasdelay = false;
@@ -4323,26 +4348,7 @@ xfs_bmapi_write(
ASSERT(!((flags & XFS_BMAPI_CONVERT) &&
(flags & XFS_BMAPI_COWFORK)));
- if (flags & XFS_BMAPI_DELALLOC) {
- /*
- * For the COW fork we can reasonably get a
- * request for converting an extent that races
- * with other threads already having converted
- * part of it, as there converting COW to
- * regular blocks is not protected using the
- * IOLOCK.
- */
- ASSERT(flags & XFS_BMAPI_COWFORK);
- if (!(flags & XFS_BMAPI_COWFORK)) {
- error = -EIO;
- goto error0;
- }
-
- if (eof || bno >= end)
- break;
- } else {
- need_alloc = true;
- }
+ need_alloc = true;
} else if (isnullstartblock(bma.got.br_startblock)) {
wasdelay = true;
}
@@ -4351,8 +4357,7 @@ xfs_bmapi_write(
* First, deal with the hole before the allocated space
* that we found, if any.
*/
- if ((need_alloc || wasdelay) &&
- !(flags & XFS_BMAPI_CONVERT_ONLY)) {
+ if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
@@ -4420,49 +4425,130 @@ xfs_bmapi_write(
}
*nmap = n;
- /*
- * Transform from btree to extents, give it cur.
- */
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- ASSERT(bma.cur);
- error = xfs_bmap_btree_to_extents(tp, ip, bma.cur,
- &tmp_logflags, whichfork);
- bma.logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto error0;
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
XFS_IFORK_NEXTENTS(ip, whichfork) >
XFS_IFORK_MAXEXT(ip, whichfork));
- error = 0;
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
+ orig_nmap, *nmap);
+ return 0;
error0:
+ xfs_bmapi_finish(&bma, whichfork, error);
+ return error;
+}
+
+/*
+ * Convert an existing delalloc extent to real blocks based on file offset. This
+ * attempts to allocate the entire delalloc extent and may require multiple
+ * invocations to allocate the target offset if a large enough physical extent
+ * is not available.
+ *
+ * @ip:         inode owning the extent. ILOCK_EXCL is taken here and dropped
+ *              again on every path that got past transaction allocation.
+ * @whichfork:  fork in which the extent is looked up.
+ * @offset_fsb: file offset to convert; it is compared against the found
+ *              extent's br_startoff.
+ * @imap:       receives the real extent that was found or allocated.
+ * @seq:        receives a READ_ONCE() snapshot of the fork's if_seq.
+ *
+ * The function allocates its own transaction and either commits it (after a
+ * successful allocation) or cancels it (every other path).
+ *
+ * Returns:
+ *   0             the extent was allocated and the transaction committed, or
+ *                 the extent at @offset_fsb was already real; in the latter
+ *                 case it is simply copied to @imap and nothing is modified.
+ *   -EAGAIN       no extent covers @offset_fsb (warns unless whichfork is
+ *                 XFS_COW_FORK).
+ *   -ENOSPC       xfs_bmapi_allocate() returned without a block
+ *                 (bma.blkno == NULLFSBLOCK).
+ *   -EFSCORRUPTED the resulting extent has start block 0 on a non-realtime
+ *                 inode.
+ *   other         any error from xfs_trans_alloc(), xfs_bmapi_allocate(),
+ *                 xfs_refcount_alloc_cow_extent(),
+ *                 xfs_bmap_btree_to_extents() or xfs_trans_commit().
+ *
+ * Note that @imap and @seq are written before the refcount and
+ * btree-to-extents steps, so they may already be filled in when one of those
+ * later steps fails.
+ *
+ * NOTE(review): bma.length is computed with max_t(), so it is never smaller
+ * than MAXEXTLEN. Confirm that xfs_bmapi_allocate() clamps the length for the
+ * wasdel case, or whether min_t() was intended.
+ * NOTE(review): bma.total is derived with XFS_DATA_FORK regardless of
+ * @whichfork. Presumably intentional; verify.
+ */
+int
+xfs_bmapi_convert_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t offset_fsb,
+ struct xfs_bmbt_irec *imap,
+ unsigned int *seq)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmalloca bma = { NULL };
+ struct xfs_trans *tp;
+ int error;
+
/*
- * Log everything. Do this after conversion, there's no point in
- * logging the extent records if we've converted to btree format.
+ * Space for the extent and indirect blocks was reserved when the
+ * delalloc extent was created so there's no need to do so here.
*/
- if ((bma.logflags & xfs_ilog_fext(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- bma.logflags &= ~xfs_ilog_fext(whichfork);
- else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)
- bma.logflags &= ~xfs_ilog_fbroot(whichfork);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
+ bma.got.br_startoff > offset_fsb) {
+ /*
+ * No extent found in the range we are trying to convert. This
+ * should only happen for the COW fork, where another thread
+ * might have moved the extent to the data fork in the meantime.
+ */
+ WARN_ON_ONCE(whichfork != XFS_COW_FORK);
+ error = -EAGAIN;
+ goto out_trans_cancel;
+ }
+
/*
- * Log whatever the flags say, even if error. Otherwise we might miss
- * detecting a case where the data is changed, there's an error,
- * and it's not logged so we don't shutdown when we should.
+ * If we find a real extent here we raced with another thread converting
+ * the extent. Just return the real extent at this offset.
*/
- if (bma.logflags)
- xfs_trans_log_inode(tp, ip, bma.logflags);
+ if (!isnullstartblock(bma.got.br_startblock)) {
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+
+ bma.tp = tp;
+ bma.ip = ip;
+ bma.wasdel = true;
+ bma.offset = bma.got.br_startoff;
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
+ if (whichfork == XFS_COW_FORK)
+ bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
+
+ if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev))
+ bma.prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmapi_allocate(&bma);
+ if (error)
+ goto out_finish;
+
+ error = -ENOSPC;
+ if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
+ goto out_finish;
+ error = -EFSCORRUPTED;
+ if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ goto out_finish;
+
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
+ XFS_STATS_INC(mp, xs_xstrat_quick);
- if (bma.cur) {
- xfs_btree_del_cursor(bma.cur, error);
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+ *imap = bma.got;
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK) {
+ error = xfs_refcount_alloc_cow_extent(tp, bma.blkno,
+ bma.length);
+ if (error)
+ goto out_finish;
}
- if (!error)
- xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
- orig_nmap, *nmap);
+
+ error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
+ whichfork);
+ if (error)
+ goto out_finish;
+
+ xfs_bmapi_finish(&bma, whichfork, 0);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+out_finish:
+ xfs_bmapi_finish(&bma, whichfork, error);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -4536,13 +4622,7 @@ xfs_bmapi_remap(
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, whichfork)) {
- int tmp_logflags = 0;
-
- error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, whichfork);
- logflags |= tmp_logflags;
- }
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork);
error0:
if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
@@ -4764,8 +4844,10 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
if (!isrt)
da_diff += del->br_blockcount;
- if (da_diff)
+ if (da_diff) {
xfs_mod_fdblocks(mp, da_diff, false);
+ xfs_mod_delalloc(mp, -da_diff);
+ }
return error;
}
@@ -5406,24 +5488,11 @@ nodelete:
error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0,
&tmp_logflags, whichfork);
logflags |= tmp_logflags;
- if (error)
- goto error0;
- }
- /*
- * transform from btree to extents, give it cur
- */
- else if (xfs_bmap_wants_extents(ip, whichfork)) {
- ASSERT(cur != NULL);
- error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
+ } else {
+ error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags,
whichfork);
- logflags |= tmp_logflags;
- if (error)
- goto error0;
}
- /*
- * transform from extents to local?
- */
- error = 0;
+
error0:
/*
* Log everything. Do this after conversion, there's no point in
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 09d3ea97cc15..8f597f9abdbe 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -95,12 +95,6 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
-/* Only convert delalloc space, don't allocate entirely new extents */
-#define XFS_BMAPI_DELALLOC 0x400
-
-/* Only convert unwritten extents, don't allocate new blocks */
-#define XFS_BMAPI_CONVERT_ONLY 0x800
-
/* Skip online discard of freed extents */
#define XFS_BMAPI_NODISCARD 0x1000
@@ -117,8 +111,6 @@ struct xfs_extent_free_item
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
- { XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
{ XFS_BMAPI_NORMAP, "NORMAP" }
@@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
-void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
@@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
+ xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
+ unsigned int *seq);
+int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
static inline void
xfs_bmap_add_free(
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cdb74d2e2a43..fbb18ba5d905 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -11,10 +11,8 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
@@ -22,7 +20,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_rmap.h"
/*
@@ -411,13 +408,15 @@ static xfs_failaddr_t
xfs_bmbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_failaddr_t fa;
unsigned int level;
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_BMAP_CRC_MAGIC):
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
/*
* XXX: need a better way of verifying the owner here. Right now
* just make sure there has been one set.
@@ -425,11 +424,6 @@ xfs_bmbt_verify(
fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_BMAP_MAGIC):
- break;
- default:
- return __this_address;
}
/*
@@ -481,6 +475,8 @@ xfs_bmbt_write_verify(
const struct xfs_buf_ops xfs_bmbt_buf_ops = {
.name = "xfs_bmbt",
+ .magic = { cpu_to_be32(XFS_BMAP_MAGIC),
+ cpu_to_be32(XFS_BMAP_CRC_MAGIC) },
.verify_read = xfs_bmbt_read_verify,
.verify_write = xfs_bmbt_write_verify,
.verify_struct = xfs_bmbt_verify,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index bbdae2b4559f..f1048efa4268 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -11,16 +11,13 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_alloc.h"
#include "xfs_log.h"
@@ -276,7 +273,7 @@ xfs_btree_lblock_calc_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -288,7 +285,7 @@ xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
@@ -314,7 +311,7 @@ xfs_btree_sblock_calc_crc(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_log_item;
- if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb))
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
@@ -326,7 +323,7 @@ xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
@@ -691,14 +688,13 @@ xfs_buf_t * /* buffer for fsbno */
xfs_btree_get_bufl(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock) /* lock flags for get_buf */
+ xfs_fsblock_t fsbno) /* file system block number */
{
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
}
/*
@@ -710,15 +706,14 @@ xfs_btree_get_bufs(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock) /* lock flags for get_buf */
+ xfs_agblock_t agbno) /* allocation group block number */
{
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0);
}
/*
@@ -845,7 +840,6 @@ xfs_btree_read_bufl(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
int refval, /* ref count value for buffer */
const struct xfs_buf_ops *ops)
@@ -858,7 +852,7 @@ xfs_btree_read_bufl(
return -EFSCORRUPTED;
d = XFS_FSB_TO_DADDR(mp, fsbno);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp, ops);
+ mp->m_bsize, 0, &bp, ops);
if (error)
return error;
if (bp)
@@ -1185,11 +1179,10 @@ xfs_btree_init_block(
xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags)
+ __u64 owner)
{
xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
- btnum, level, numrecs, owner, flags);
+ btnum, level, numrecs, owner, 0);
}
STATIC void
@@ -1288,7 +1281,6 @@ STATIC int
xfs_btree_get_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
- int flags,
struct xfs_btree_block **block,
struct xfs_buf **bpp)
{
@@ -1296,14 +1288,11 @@ xfs_btree_get_buf_block(
xfs_daddr_t d;
int error;
- /* need to sort out how callers deal with failures first */
- ASSERT(!(flags & XBF_TRYLOCK));
-
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
if (error)
return error;
*bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags);
+ mp->m_bsize, 0);
if (!*bpp)
return -ENOMEM;
@@ -2706,7 +2695,7 @@ __xfs_btree_split(
XFS_BTREE_STATS_INC(cur, alloc);
/* Set up the new block as "right". */
- error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+ error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
if (error)
goto error0;
@@ -2961,7 +2950,7 @@ xfs_btree_new_iroot(
XFS_BTREE_STATS_INC(cur, alloc);
/* Copy the root into a real block. */
- error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+ error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
if (error)
goto error0;
@@ -3058,7 +3047,7 @@ xfs_btree_new_root(
XFS_BTREE_STATS_INC(cur, alloc);
/* Set up the new block. */
- error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+ error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
if (error)
goto error0;
@@ -4433,7 +4422,7 @@ xfs_btree_lblock_v5hdr_verify(
struct xfs_buf *bp,
uint64_t owner)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -4454,7 +4443,7 @@ xfs_btree_lblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
/* numrecs verification */
@@ -4484,7 +4473,7 @@ xfs_failaddr_t
xfs_btree_sblock_v5hdr_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
@@ -4510,7 +4499,7 @@ xfs_btree_sblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_agblock_t agno;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index e3b3e9dce5da..fa3cd8ab9aba 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -301,8 +301,7 @@ struct xfs_buf * /* buffer for fsbno */
xfs_btree_get_bufl(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock); /* lock flags for get_buf */
+ xfs_fsblock_t fsbno); /* file system block number */
/*
* Get a buffer for the block, return it with no data read.
@@ -313,8 +312,7 @@ xfs_btree_get_bufs(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- uint lock); /* lock flags for get_buf */
+ xfs_agblock_t agbno); /* allocation group block number */
/*
* Check for the cursor referring to the last block at the given level.
@@ -345,7 +343,6 @@ xfs_btree_read_bufl(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
int refval, /* ref count value for buffer */
const struct xfs_buf_ops *ops);
@@ -383,8 +380,7 @@ xfs_btree_init_block(
xfs_btnum_t btnum,
__u16 level,
__u16 numrecs,
- __u64 owner,
- unsigned int flags);
+ __u64 owner);
void
xfs_btree_init_block_int(
@@ -469,8 +465,8 @@ uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
/* return codes */
-#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
-#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */
+#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */
+#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */
typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
union xfs_btree_rec *rec, void *priv);
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 376bee94b5dd..d1c77fd0815d 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -12,20 +12,14 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
#include "xfs_bmap.h"
-#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -116,35 +110,52 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_zone_free(xfs_da_state_zone, state);
}
+/*
+ * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only
+ * accessible on v5 filesystems. This header format is common across da node,
+ * attr leaf and dir leaf blocks.
+ *
+ * @bp is the buffer being verified and @hdr3 points at the block header to
+ * check. The embedded hdr.magic is always checked with xfs_verify_magic16().
+ * Only when xfs_sb_version_hascrc() is true are the uuid (against
+ * sb_meta_uuid), blkno (against bp->b_bn) and lsn (via xfs_log_check_lsn())
+ * fields examined.
+ *
+ * Returns NULL if every check passes, otherwise __this_address from the
+ * first check that failed.
+ */
+xfs_failaddr_t
+xfs_da3_blkinfo_verify(
+ struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_da_blkinfo *hdr = &hdr3->hdr;
+
+ if (!xfs_verify_magic16(bp, hdr->magic))
+ return __this_address;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+ if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
+ return __this_address;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
static xfs_failaddr_t
xfs_da3_node_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
const struct xfs_dir_ops *ops;
+ xfs_failaddr_t fa;
ops = xfs_dir_get_ops(mp, NULL);
ops->node_hdr_from_disk(&ichdr, hdr);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
-
- if (ichdr.magic != XFS_DA3_NODE_MAGIC)
- return __this_address;
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
- if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
- return __this_address;
- } else {
- if (ichdr.magic != XFS_DA_NODE_MAGIC)
- return __this_address;
- }
if (ichdr.level == 0)
return __this_address;
if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
@@ -169,7 +180,7 @@ static void
xfs_da3_node_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -257,6 +268,8 @@ xfs_da3_node_verify_struct(
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
.name = "xfs_da3_node",
+ .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC),
+ cpu_to_be16(XFS_DA3_NODE_MAGIC) },
.verify_read = xfs_da3_node_read_verify,
.verify_write = xfs_da3_node_write_verify,
.verify_struct = xfs_da3_node_verify_struct,
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index b39053dcb643..b1ae572496b6 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -11,11 +11,8 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
/*
* Shortform directory ops
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5d5bf3bffc78..ae654e06b2fb 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp)
return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog);
}
+xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp,
+ struct xfs_da3_blkinfo *hdr3);
+
#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 94f00427de98..eb2be2a6a25a 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -9,8 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
@@ -274,13 +272,15 @@ xfs_defer_trans_roll(
trace_xfs_defer_trans_roll(tp, _RET_IP_);
- /* Roll the transaction. */
+ /*
+ * Roll the transaction. Rolling always gives a new transaction (even
+ * if committing the old one fails!) to hand back to the caller, so we
+ * join the held resources to the new transaction so that we always
+ * return with the held resources joined to @tpp, no matter what
+ * happened.
+ */
error = xfs_trans_roll(tpp);
tp = *tpp;
- if (error) {
- trace_xfs_defer_trans_roll_error(tp, error);
- return error;
- }
/* Rejoin the joined inodes. */
for (i = 0; i < ipcount; i++)
@@ -292,6 +292,8 @@ xfs_defer_trans_roll(
xfs_trans_bhold(tp, bplist[i]);
}
+ if (error)
+ trace_xfs_defer_trans_roll_error(tp, error);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 229152cd1a24..67840723edbb 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -5,20 +5,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
@@ -703,3 +699,20 @@ xfs_dir2_shrink_inode(
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
return 0;
}
+
+/*
+ * Returns true if the directory entry name is valid.
+ *
+ * @name points at @length bytes of name data; the trailing null is not
+ * counted in @length. A name is rejected if @length is MAXNAMELEN or more,
+ * or if any of the first @length bytes is a '/' or a zero byte.
+ *
+ * NOTE: a zero @length passes every check made here.
+ */
+bool
+xfs_dir2_namecheck(
+ const void *name,
+ size_t length)
+{
+ /*
+ * MAXNAMELEN includes the trailing null, but (name/length) leave it
+ * out, so use >= for the length check.
+ */
+ if (length >= MAXNAMELEN)
+ return false;
+
+ /* There shouldn't be any slashes or nulls here */
+ return !memchr(name, '/', length) && !memchr(name, 0, length);
+}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c3e3f6b813d8..f54244779492 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype);
void *xfs_dir3_data_endp(struct xfs_da_geometry *geo,
struct xfs_dir2_data_hdr *hdr);
+bool xfs_dir2_namecheck(const void *name, size_t length);
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 30ed5919da72..a6fb0cc2085e 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -6,22 +6,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_buf_item.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_log.h"
/*
@@ -50,21 +47,19 @@ static xfs_failaddr_t
xfs_dir3_block_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -73,7 +68,7 @@ static void
xfs_dir3_block_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -90,7 +85,7 @@ static void
xfs_dir3_block_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -112,6 +107,8 @@ xfs_dir3_block_write_verify(
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.name = "xfs_dir3_block",
+ .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC),
+ cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) },
.verify_read = xfs_dir3_block_read_verify,
.verify_write = xfs_dir3_block_write_verify,
.verify_struct = xfs_dir3_block_verify,
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 01162c62ec8f..2c79be4c3153 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -6,19 +6,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_log.h"
static xfs_failaddr_t xfs_dir2_data_freefind_verify(
@@ -50,14 +47,13 @@ __xfs_dir3_data_check(
int i; /* leaf index */
int lastfree; /* last entry was unused */
xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */
- xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_mount *mp = bp->b_mount;
char *p; /* current data position */
int stale; /* count of stale leaves */
struct xfs_name name;
const struct xfs_dir_ops *ops;
struct xfs_da_geometry *geo;
- mp = bp->b_target->bt_mount;
geo = mp->m_dir_geo;
/*
@@ -249,21 +245,19 @@ static xfs_failaddr_t
xfs_dir3_data_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr3->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
- return __this_address;
}
return __xfs_dir3_data_check(NULL, bp);
}
@@ -300,7 +294,7 @@ static void
xfs_dir3_data_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -317,7 +311,7 @@ static void
xfs_dir3_data_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -339,6 +333,8 @@ xfs_dir3_data_write_verify(
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
.name = "xfs_dir3_data",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_read_verify,
.verify_write = xfs_dir3_data_write_verify,
.verify_struct = xfs_dir3_data_verify,
@@ -346,6 +342,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.name = "xfs_dir3_data_reada",
+ .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC),
+ cpu_to_be32(XFS_DIR3_DATA_MAGIC) },
.verify_read = xfs_dir3_data_reada_verify,
.verify_write = xfs_dir3_data_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 1728a3e6f5cf..a53e4585a2f3 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -6,12 +6,11 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
@@ -20,8 +19,6 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
-#include "xfs_log.h"
/*
* Local function declarations.
@@ -142,66 +139,46 @@ xfs_dir3_leaf_check_int(
*/
static xfs_failaddr_t
xfs_dir3_leaf_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
+ xfs_failaddr_t fa;
- ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
- uint16_t magic3;
-
- magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
- : XFS_DIR3_LEAFN_MAGIC;
-
- if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
- return __this_address;
- if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid))
- return __this_address;
- if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
- return __this_address;
- if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
- return __this_address;
- } else {
- if (leaf->hdr.info.magic != cpu_to_be16(magic))
- return __this_address;
- }
+ fa = xfs_da3_blkinfo_verify(bp, bp->b_addr);
+ if (fa)
+ return fa;
return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
-__read_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_read_verify(
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
static void
-__write_verify(
- struct xfs_buf *bp,
- uint16_t magic)
+xfs_dir3_leaf_write_verify(
+ struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
- fa = xfs_dir3_leaf_verify(bp, magic);
+ fa = xfs_dir3_leaf_verify(bp);
if (fa) {
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
return;
@@ -216,60 +193,22 @@ __write_verify(
xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
-static xfs_failaddr_t
-xfs_dir3_leaf1_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static void
-xfs_dir3_leaf1_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAF1_MAGIC);
-}
-
-static xfs_failaddr_t
-xfs_dir3_leafn_verify(
- struct xfs_buf *bp)
-{
- return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_read_verify(
- struct xfs_buf *bp)
-{
- __read_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
-static void
-xfs_dir3_leafn_write_verify(
- struct xfs_buf *bp)
-{
- __write_verify(bp, XFS_DIR2_LEAFN_MAGIC);
-}
-
const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
.name = "xfs_dir3_leaf1",
- .verify_read = xfs_dir3_leaf1_read_verify,
- .verify_write = xfs_dir3_leaf1_write_verify,
- .verify_struct = xfs_dir3_leaf1_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
.name = "xfs_dir3_leafn",
- .verify_read = xfs_dir3_leafn_read_verify,
- .verify_write = xfs_dir3_leafn_write_verify,
- .verify_struct = xfs_dir3_leafn_verify,
+ .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC),
+ cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) },
+ .verify_read = xfs_dir3_leaf_read_verify,
+ .verify_write = xfs_dir3_leaf_write_verify,
+ .verify_struct = xfs_dir3_leaf_verify,
};
int
@@ -621,43 +560,40 @@ xfs_dir3_leaf_find_entry(
*/
int /* error */
xfs_dir2_leaf_addname(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_trans *tp = args->trans;
__be16 *bestsp; /* freespace table in leaf */
- int compact; /* need to compact leaves */
- xfs_dir2_data_hdr_t *hdr; /* data block header */
+ __be16 *tagp; /* end of data entry */
struct xfs_buf *dbp; /* data block buffer */
- xfs_dir2_data_entry_t *dep; /* data block entry */
- xfs_inode_t *dp; /* incore directory inode */
- xfs_dir2_data_unused_t *dup; /* data unused entry */
+ struct xfs_buf *lbp; /* leaf's buffer */
+ struct xfs_dir2_leaf *leaf; /* leaf structure */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
+ struct xfs_dir2_data_hdr *hdr; /* data block header */
+ struct xfs_dir2_data_entry *dep; /* data block entry */
+ struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */
+ struct xfs_dir2_leaf_entry *ents;
+ struct xfs_dir2_data_unused *dup; /* data unused entry */
+ struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */
+ struct xfs_dir2_data_free *bf; /* bestfree table */
+ int compact; /* need to compact leaves */
int error; /* error return value */
int grown; /* allocated new data block */
- int highstale; /* index of next stale leaf */
+ int highstale = 0; /* index of next stale leaf */
int i; /* temporary, index */
int index; /* leaf table position */
- struct xfs_buf *lbp; /* leaf's buffer */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
int length; /* length of new entry */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */
int lfloglow; /* low leaf logging index */
int lfloghigh; /* high leaf logging index */
- int lowstale; /* index of prev stale leaf */
- xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */
+ int lowstale = 0; /* index of prev stale leaf */
int needbytes; /* leaf block bytes needed */
int needlog; /* need to log data header */
int needscan; /* need to rescan data free */
- __be16 *tagp; /* end of data entry */
- xfs_trans_t *tp; /* transaction pointer */
xfs_dir2_db_t use_block; /* data block number */
- struct xfs_dir2_data_free *bf; /* bestfree table */
- struct xfs_dir2_leaf_entry *ents;
- struct xfs_dir3_icleaf_hdr leafhdr;
trace_xfs_dir2_leaf_addname(args);
- dp = args->dp;
- tp = args->trans;
-
error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index f1bb3434f51c..afcc6642690a 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -6,12 +6,11 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_dir2.h"
@@ -20,7 +19,6 @@
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_log.h"
/*
@@ -84,23 +82,21 @@ static xfs_failaddr_t
xfs_dir3_free_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ if (!xfs_verify_magic(bp, hdr->magic))
+ return __this_address;
+
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
- if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC))
- return __this_address;
if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return __this_address;
if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
return __this_address;
- } else {
- if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
- return __this_address;
}
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */
@@ -112,7 +108,7 @@ static void
xfs_dir3_free_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -129,7 +125,7 @@ static void
xfs_dir3_free_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
xfs_failaddr_t fa;
@@ -151,6 +147,8 @@ xfs_dir3_free_write_verify(
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
.name = "xfs_dir3_free",
+ .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC),
+ cpu_to_be32(XFS_DIR3_FREE_MAGIC) },
.verify_read = xfs_dir3_free_read_verify,
.verify_write = xfs_dir3_free_write_verify,
.verify_struct = xfs_dir3_free_verify,
@@ -426,24 +424,22 @@ xfs_dir2_leaf_to_node(
static int /* error */
xfs_dir2_leafn_add(
struct xfs_buf *bp, /* leaf buffer */
- xfs_da_args_t *args, /* operation arguments */
+ struct xfs_da_args *args, /* operation arguments */
int index) /* insertion pt for new entry */
{
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_leaf *leaf = bp->b_addr;
+ struct xfs_dir2_leaf_entry *lep;
+ struct xfs_dir2_leaf_entry *ents;
int compact; /* compacting stale leaves */
- xfs_inode_t *dp; /* incore directory inode */
- int highstale; /* next stale entry */
- xfs_dir2_leaf_t *leaf; /* leaf structure */
- xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ int highstale = 0; /* next stale entry */
int lfloghigh; /* high leaf entry logging */
int lfloglow; /* low leaf entry logging */
- int lowstale; /* previous stale entry */
- struct xfs_dir3_icleaf_hdr leafhdr;
- struct xfs_dir2_leaf_entry *ents;
+ int lowstale = 0; /* previous stale entry */
trace_xfs_dir2_leafn_add(args, index);
- dp = args->dp;
- leaf = bp->b_addr;
dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ents = dp->d_ops->leaf_ents_p(leaf);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 585dfdb7b6b6..033589257f54 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -5,16 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_trace.h"
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index d293f371dd54..e8bd688a4073 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -16,8 +16,6 @@
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_error.h"
-#include "xfs_cksum.h"
-#include "xfs_trace.h"
int
xfs_calc_dquots_per_chunk(
@@ -110,7 +108,7 @@ xfs_dqblk_verify(
/*
* Do some primitive error checking on ondisk dquot data structures.
*/
-int
+void
xfs_dqblk_repair(
struct xfs_mount *mp,
struct xfs_dqblk *dqb,
@@ -133,8 +131,6 @@ xfs_dqblk_repair(
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
-
- return 0;
}
STATIC bool
@@ -226,7 +222,7 @@ static xfs_failaddr_t
xfs_dquot_buf_verify_struct(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
return xfs_dquot_buf_verify(mp, bp, false);
}
@@ -235,7 +231,7 @@ static void
xfs_dquot_buf_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (!xfs_dquot_buf_verify_crc(mp, bp, false))
return;
@@ -252,7 +248,7 @@ static void
xfs_dquot_buf_readahead_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
xfs_dquot_buf_verify(mp, bp, true) != NULL) {
@@ -270,13 +266,15 @@ static void
xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_dquot_buf_verify(mp, bp, false);
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
.name = "xfs_dquot",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_read_verify,
.verify_write = xfs_dquot_buf_write_verify,
.verify_struct = xfs_dquot_buf_verify_struct,
@@ -284,6 +282,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
.name = "xfs_dquot_ra",
+ .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC),
+ cpu_to_be16(XFS_DQUOT_MAGIC) },
.verify_read = xfs_dquot_buf_readahead_verify,
.verify_write = xfs_dquot_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 66077a105cbb..79e6c4fb1d8a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -54,7 +54,8 @@
#define XFS_ERRTAG_BUF_LRU_REF 31
#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
#define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33
-#define XFS_ERRTAG_MAX 34
+#define XFS_ERRTAG_IUNLINK_FALLBACK 34
+#define XFS_ERRTAG_MAX 35
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -93,5 +94,6 @@
#define XFS_RANDOM_BUF_LRU_REF 2
#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
+#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9bb3c48843ec..c968b60cee15 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1071,7 +1071,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1)
#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog
#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog
-#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log
+#define XFS_INO_AGINO_BITS(mp) ((mp)->m_ino_geo.agino_log)
#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log
#define XFS_INO_BITS(mp) \
XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index f3aa59302fef..52d03a3a02a4 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -97,7 +97,7 @@ struct getbmapx {
* For use by backup and restore programs to set the XFS on-disk inode
* fields di_dmevmask and di_dmstate. These must be set to exactly and
* only values previously obtained via xfs_bulkstat! (Specifically the
- * xfs_bstat_t fields bs_dmevmask and bs_dmstate.)
+ * struct xfs_bstat fields bs_dmevmask and bs_dmstate.)
*/
#ifndef HAVE_FSDMIDATA
struct fsdmidata {
@@ -124,7 +124,7 @@ typedef struct xfs_flock64 {
/*
* Output for XFS_IOC_FSGEOMETRY_V1
*/
-typedef struct xfs_fsop_geom_v1 {
+struct xfs_fsop_geom_v1 {
__u32 blocksize; /* filesystem (data) block size */
__u32 rtextsize; /* realtime extent size */
__u32 agblocks; /* fsblocks in an AG */
@@ -145,12 +145,39 @@ typedef struct xfs_fsop_geom_v1 {
__u32 logsectsize; /* log sector size, bytes */
__u32 rtsectsize; /* realtime sector size, bytes */
__u32 dirblocksize; /* directory block size, bytes */
-} xfs_fsop_geom_v1_t;
+};
+
+/*
+ * Output for XFS_IOC_FSGEOMETRY_V4
+ */
+struct xfs_fsop_geom_v4 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+ __u32 logsunit; /* log stripe unit, bytes */
+};
/*
* Output for XFS_IOC_FSGEOMETRY
*/
-typedef struct xfs_fsop_geom {
+struct xfs_fsop_geom {
__u32 blocksize; /* filesystem (data) block size */
__u32 rtextsize; /* realtime extent size */
__u32 agblocks; /* fsblocks in an AG */
@@ -171,8 +198,18 @@ typedef struct xfs_fsop_geom {
__u32 logsectsize; /* log sector size, bytes */
__u32 rtsectsize; /* realtime sector size, bytes */
__u32 dirblocksize; /* directory block size, bytes */
- __u32 logsunit; /* log stripe unit, bytes */
-} xfs_fsop_geom_t;
+ __u32 logsunit; /* log stripe unit, bytes */
+ uint32_t sick; /* o: unhealthy fs & rt metadata */
+ uint32_t checked; /* o: checked fs & rt metadata */
+ __u64 reserved[17]; /* reserved space */
+};
+
+#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
+#define XFS_FSOP_GEOM_SICK_UQUOTA (1 << 1) /* user quota */
+#define XFS_FSOP_GEOM_SICK_GQUOTA (1 << 2) /* group quota */
+#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */
+#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */
+#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */
/* Output for XFS_FS_COUNTS */
typedef struct xfs_fsop_counts {
@@ -188,28 +225,30 @@ typedef struct xfs_fsop_resblks {
__u64 resblks_avail;
} xfs_fsop_resblks_t;
-#define XFS_FSOP_GEOM_VERSION 0
-
-#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */
-#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */
-#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */
-#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */
-#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */
-#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */
-#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* special extent flag */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */
-#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
-#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
-#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
-#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
-#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
-#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
-#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
-#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */
-#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */
+#define XFS_FSOP_GEOM_VERSION 0
+#define XFS_FSOP_GEOM_VERSION_V5 5
+
+#define XFS_FSOP_GEOM_FLAGS_ATTR (1 << 0) /* attributes in use */
+#define XFS_FSOP_GEOM_FLAGS_NLINK (1 << 1) /* 32-bit nlink values */
+#define XFS_FSOP_GEOM_FLAGS_QUOTA (1 << 2) /* quotas enabled */
+#define XFS_FSOP_GEOM_FLAGS_IALIGN (1 << 3) /* inode alignment */
+#define XFS_FSOP_GEOM_FLAGS_DALIGN (1 << 4) /* large data alignment */
+#define XFS_FSOP_GEOM_FLAGS_SHARED (1 << 5) /* read-only shared */
+#define XFS_FSOP_GEOM_FLAGS_EXTFLG (1 << 6) /* special extent flag */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2 (1 << 7) /* directory version 2 */
+#define XFS_FSOP_GEOM_FLAGS_LOGV2 (1 << 8) /* log format version 2 */
+#define XFS_FSOP_GEOM_FLAGS_SECTOR (1 << 9) /* sector sizes >1BB */
+#define XFS_FSOP_GEOM_FLAGS_ATTR2 (1 << 10) /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 (1 << 11) /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI (1 << 12) /* ASCII only CI names */
+ /* -- Do not use -- (1 << 13) SGI parent pointers */
+#define XFS_FSOP_GEOM_FLAGS_LAZYSB (1 << 14) /* lazy superblock counters */
+#define XFS_FSOP_GEOM_FLAGS_V5SB (1 << 15) /* version 5 superblock */
+#define XFS_FSOP_GEOM_FLAGS_FTYPE (1 << 16) /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT (1 << 17) /* free inode btree */
+#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
/*
* Minimum and maximum sizes need for growth checks.
@@ -238,6 +277,31 @@ typedef struct xfs_fsop_resblks {
(s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
/*
+ * Output for XFS_IOC_AG_GEOMETRY
+ */
+struct xfs_ag_geometry {
+ uint32_t ag_number; /* i/o: AG number */
+ uint32_t ag_length; /* o: length in blocks */
+ uint32_t ag_freeblks; /* o: free space */
+ uint32_t ag_icount; /* o: inodes allocated */
+ uint32_t ag_ifree; /* o: inodes free */
+ uint32_t ag_sick; /* o: sick things in ag */
+ uint32_t ag_checked; /* o: checked metadata in ag */
+ uint32_t ag_reserved32; /* o: zero */
+ uint64_t ag_reserved[12];/* o: zero */
+};
+#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */
+#define XFS_AG_GEOM_SICK_AGF (1 << 1) /* AGF header */
+#define XFS_AG_GEOM_SICK_AGFL (1 << 2) /* AGFL header */
+#define XFS_AG_GEOM_SICK_AGI (1 << 3) /* AGI header */
+#define XFS_AG_GEOM_SICK_BNOBT (1 << 4) /* free space by block */
+#define XFS_AG_GEOM_SICK_CNTBT (1 << 5) /* free space by length */
+#define XFS_AG_GEOM_SICK_INOBT (1 << 6) /* inode index */
+#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */
+#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */
+
+/*
* Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
*/
typedef struct xfs_growfs_data {
@@ -264,7 +328,7 @@ typedef struct xfs_bstime {
__s32 tv_nsec; /* and nanoseconds */
} xfs_bstime_t;
-typedef struct xfs_bstat {
+struct xfs_bstat {
__u64 bs_ino; /* inode number */
__u16 bs_mode; /* type and mode */
__u16 bs_nlink; /* number of links */
@@ -285,12 +349,70 @@ typedef struct xfs_bstat {
#define bs_projid bs_projid_lo /* (previously just bs_projid) */
__u16 bs_forkoff; /* inode fork offset in bytes */
__u16 bs_projid_hi; /* higher part of project id */
- unsigned char bs_pad[6]; /* pad space, unused */
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ unsigned char bs_pad[2]; /* pad space, unused */
__u32 bs_cowextsize; /* cow extent size */
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
-} xfs_bstat_t;
+};
+
+/* New bulkstat structure that reports v5 features and fixes padding issues */
+struct xfs_bulkstat {
+ uint64_t bs_ino; /* inode number */
+ uint64_t bs_size; /* file size */
+
+ uint64_t bs_blocks; /* number of blocks */
+ uint64_t bs_xflags; /* extended flags */
+
+ uint64_t bs_atime; /* access time, seconds */
+ uint64_t bs_mtime; /* modify time, seconds */
+
+ uint64_t bs_ctime; /* inode change time, seconds */
+ uint64_t bs_btime; /* creation time, seconds */
+
+ uint32_t bs_gen; /* generation count */
+ uint32_t bs_uid; /* user id */
+ uint32_t bs_gid; /* group id */
+ uint32_t bs_projectid; /* project id */
+
+ uint32_t bs_atime_nsec; /* access time, nanoseconds */
+ uint32_t bs_mtime_nsec; /* modify time, nanoseconds */
+ uint32_t bs_ctime_nsec; /* inode change time, nanoseconds */
+ uint32_t bs_btime_nsec; /* creation time, nanoseconds */
+
+ uint32_t bs_blksize; /* block size */
+ uint32_t bs_rdev; /* device value */
+ uint32_t bs_cowextsize_blks; /* cow extent size hint, blocks */
+ uint32_t bs_extsize_blks; /* extent size hint, blocks */
+
+ uint32_t bs_nlink; /* number of links */
+ uint32_t bs_extents; /* number of extents */
+ uint32_t bs_aextents; /* attribute number of extents */
+ uint16_t bs_version; /* structure version */
+ uint16_t bs_forkoff; /* inode fork offset in bytes */
+
+ uint16_t bs_sick; /* sick inode metadata */
+ uint16_t bs_checked; /* checked inode metadata */
+ uint16_t bs_mode; /* type and mode */
+ uint16_t bs_pad2; /* zeroed */
+
+ uint64_t bs_pad[7]; /* zeroed */
+};
+
+#define XFS_BULKSTAT_VERSION_V1 (1)
+#define XFS_BULKSTAT_VERSION_V5 (5)
+
+/* bs_sick flags */
+#define XFS_BS_SICK_INODE (1 << 0) /* inode core */
+#define XFS_BS_SICK_BMBTD (1 << 1) /* data fork */
+#define XFS_BS_SICK_BMBTA (1 << 2) /* attr fork */
+#define XFS_BS_SICK_BMBTC (1 << 3) /* cow fork */
+#define XFS_BS_SICK_DIR (1 << 4) /* directory */
+#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */
+#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */
/*
* Project quota id helpers (previously projid was 16bit only
@@ -298,7 +420,7 @@ typedef struct xfs_bstat {
* to retain compatibility with "old" filesystems).
*/
static inline uint32_t
-bstat_get_projid(struct xfs_bstat *bs)
+bstat_get_projid(const struct xfs_bstat *bs)
{
return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo;
}
@@ -306,23 +428,79 @@ bstat_get_projid(struct xfs_bstat *bs)
/*
* The user-level BulkStat Request interface structure.
*/
-typedef struct xfs_fsop_bulkreq {
+struct xfs_fsop_bulkreq {
__u64 __user *lastip; /* last inode # pointer */
__s32 icount; /* count of entries in buffer */
void __user *ubuffer;/* user buffer for inode desc. */
__s32 __user *ocount; /* output count pointer */
-} xfs_fsop_bulkreq_t;
-
+};
/*
* Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS).
*/
-typedef struct xfs_inogrp {
+struct xfs_inogrp {
__u64 xi_startino; /* starting inode number */
__s32 xi_alloccount; /* # bits set in allocmask */
__u64 xi_allocmask; /* mask of allocated inodes */
-} xfs_inogrp_t;
+};
+
+/* New inumbers structure that reports v5 features and fixes padding issues */
+struct xfs_inumbers {
+ uint64_t xi_startino; /* starting inode number */
+ uint64_t xi_allocmask; /* mask of allocated inodes */
+ uint8_t xi_alloccount; /* # bits set in allocmask */
+ uint8_t xi_version; /* version */
+ uint8_t xi_padding[6]; /* zero */
+};
+
+#define XFS_INUMBERS_VERSION_V1 (1)
+#define XFS_INUMBERS_VERSION_V5 (5)
+
+/* Header for bulk inode requests. */
+struct xfs_bulk_ireq {
+ uint64_t ino; /* I/O: start with this inode */
+ uint32_t flags; /* I/O: operation flags */
+ uint32_t icount; /* I: count of entries in buffer */
+ uint32_t ocount; /* O: count of entries filled out */
+ uint32_t agno; /* I: see comment for IREQ_AGNO */
+ uint64_t reserved[5]; /* must be zero */
+};
+
+/*
+ * Only return results from the specified @agno. If @ino is zero, start
+ * with the first inode of @agno.
+ */
+#define XFS_BULK_IREQ_AGNO (1 << 0)
+
+/*
+ * Return bulkstat information for a single inode, where the @ino value is a
+ * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
+ * values below. Not compatible with XFS_BULK_IREQ_AGNO.
+ */
+#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL)
+
+/* Operate on the root directory inode. */
+#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
+
+/*
+ * ioctl structures for v5 bulkstat and inumbers requests
+ */
+struct xfs_bulkstat_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_bulkstat bulkstat[];
+};
+#define XFS_BULKSTAT_REQ_SIZE(nr) (sizeof(struct xfs_bulkstat_req) + \
+ (nr) * sizeof(struct xfs_bulkstat))
+struct xfs_inumbers_req {
+ struct xfs_bulk_ireq hdr;
+ struct xfs_inumbers inumbers[];
+};
+#define XFS_INUMBERS_REQ_SIZE(nr) (sizeof(struct xfs_inumbers_req) + \
+ (nr) * sizeof(struct xfs_inumbers))
/*
* Error injection.
@@ -453,7 +631,7 @@ typedef struct xfs_swapext
xfs_off_t sx_offset; /* offset into file */
xfs_off_t sx_length; /* leng from offset */
char sx_pad[16]; /* pad space, unused */
- xfs_bstat_t sx_stat; /* stat of target b4 copy */
+ struct xfs_bstat sx_stat; /* stat of target b4 copy */
} xfs_swapext_t;
/*
@@ -502,9 +680,10 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */
#define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */
#define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */
+#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */
/* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR 24
+#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
@@ -590,6 +769,7 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks)
/* XFS_IOC_GETFSMAP ------ hoisted 59 */
#define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata)
+#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry)
/*
* ioctl commands that replace IRIX syssgi()'s
@@ -620,8 +800,11 @@ struct xfs_scrub_metadata {
#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
#define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq)
-#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom)
+#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4)
#define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t)
+#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom)
+#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req)
+#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
new file mode 100644
index 000000000000..272005ac8c88
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_HEALTH_H__
+#define __XFS_HEALTH_H__
+
+/*
+ * In-Core Filesystem Health Assessments
+ * =====================================
+ *
+ * We'd like to be able to summarize the current health status of the
+ * filesystem so that the administrator knows when it's necessary to schedule
+ * some downtime for repairs. Until then, we would also like to avoid abrupt
+ * shutdowns due to corrupt metadata.
+ *
+ * The online scrub feature evaluates the health of all filesystem metadata.
+ * When scrub detects corruption in a piece of metadata it will set the
+ * corresponding sickness flag, and repair will clear it if successful. If
+ * problems remain at unmount time, we can also request manual intervention by
+ * logging a notice to run xfs_repair.
+ *
+ * Each health tracking group uses a pair of fields for reporting. The
+ * "checked" field tells us if a given piece of metadata has ever been examined,
+ * and the "sick" field tells us if that piece was found to need repairs.
+ * Therefore we can conclude that for a given sick flag value:
+ *
+ * - checked && sick => metadata needs repair
+ * - checked && !sick => metadata is ok
+ * - !checked => has not been examined since mount
+ */
+
+struct xfs_mount;
+struct xfs_perag;
+struct xfs_inode;
+struct xfs_fsop_geom;
+
+/* Observable health issues for metadata spanning the entire filesystem. */
+#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */
+#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */
+#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */
+#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */
+
+/* Observable health issues for realtime volume metadata. */
+#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */
+#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */
+
+/* Observable health issues for AG metadata. */
+#define XFS_SICK_AG_SB (1 << 0) /* superblock */
+#define XFS_SICK_AG_AGF (1 << 1) /* AGF header */
+#define XFS_SICK_AG_AGFL (1 << 2) /* AGFL header */
+#define XFS_SICK_AG_AGI (1 << 3) /* AGI header */
+#define XFS_SICK_AG_BNOBT (1 << 4) /* free space by block */
+#define XFS_SICK_AG_CNTBT (1 << 5) /* free space by length */
+#define XFS_SICK_AG_INOBT (1 << 6) /* inode index */
+#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */
+#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */
+#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */
+
+/* Observable health issues for inode metadata. */
+#define XFS_SICK_INO_CORE (1 << 0) /* inode core */
+#define XFS_SICK_INO_BMBTD (1 << 1) /* data fork */
+#define XFS_SICK_INO_BMBTA (1 << 2) /* attr fork */
+#define XFS_SICK_INO_BMBTC (1 << 3) /* cow fork */
+#define XFS_SICK_INO_DIR (1 << 4) /* directory */
+#define XFS_SICK_INO_XATTR (1 << 5) /* extended attributes */
+#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
+#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+
+/* Primary evidence of health problems in a given group. */
+#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
+ XFS_SICK_FS_UQUOTA | \
+ XFS_SICK_FS_GQUOTA | \
+ XFS_SICK_FS_PQUOTA)
+
+#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \
+ XFS_SICK_RT_SUMMARY)
+
+#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \
+ XFS_SICK_AG_AGF | \
+ XFS_SICK_AG_AGFL | \
+ XFS_SICK_AG_AGI | \
+ XFS_SICK_AG_BNOBT | \
+ XFS_SICK_AG_CNTBT | \
+ XFS_SICK_AG_INOBT | \
+ XFS_SICK_AG_FINOBT | \
+ XFS_SICK_AG_RMAPBT | \
+ XFS_SICK_AG_REFCNTBT)
+
+#define XFS_SICK_INO_PRIMARY (XFS_SICK_INO_CORE | \
+ XFS_SICK_INO_BMBTD | \
+ XFS_SICK_INO_BMBTA | \
+ XFS_SICK_INO_BMBTC | \
+ XFS_SICK_INO_DIR | \
+ XFS_SICK_INO_XATTR | \
+ XFS_SICK_INO_SYMLINK | \
+ XFS_SICK_INO_PARENT)
+
+/* These functions must be provided by the xfs implementation. */
+
+void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
+ unsigned int *checked);
+
+void xfs_health_unmount(struct xfs_mount *mp);
+
+/* Now some helpers. */
+
+static inline bool
+xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask)
+{
+ unsigned int sick, checked;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ return sick & mask;
+}
+
+static inline bool
+xfs_fs_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_fs_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_rt_is_healthy(struct xfs_mount *mp)
+{
+ return !xfs_rt_has_sickness(mp, -1U);
+}
+
+static inline bool
+xfs_ag_is_healthy(struct xfs_perag *pag)
+{
+ return !xfs_ag_has_sickness(pag, -1U);
+}
+
+static inline bool
+xfs_inode_is_healthy(struct xfs_inode *ip)
+{
+ return !xfs_inode_has_sickness(ip, -1U);
+}
+
+void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
+void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
+void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
+
+#endif /* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index d32152fc8a6c..04377ab75863 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -12,17 +12,14 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_icreate_item.h"
@@ -31,20 +28,6 @@
#include "xfs_log.h"
#include "xfs_rmap.h"
-
-/*
- * Allocation group level functions.
- */
-int
-xfs_ialloc_cluster_alignment(
- struct xfs_mount *mp)
-{
- if (xfs_sb_version_hasalign(&mp->m_sb) &&
- mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
- return mp->m_sb.sb_inoalignmt;
- return 1;
-}
-
/*
* Lookup a record by ino in the btree given by cur.
*/
@@ -299,7 +282,7 @@ xfs_ialloc_inode_init(
* sizes, manipulate the inodes in buffers which are multiples of the
* blocks size.
*/
- nbufs = length / mp->m_blocks_per_cluster;
+ nbufs = length / M_IGEO(mp)->blocks_per_cluster;
/*
* Figure out what version number to use in the inodes we create. If
@@ -343,9 +326,10 @@ xfs_ialloc_inode_init(
* Get the block.
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno +
- (j * mp->m_blocks_per_cluster));
+ (j * M_IGEO(mp)->blocks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * mp->m_blocks_per_cluster,
+ mp->m_bsize *
+ M_IGEO(mp)->blocks_per_cluster,
XBF_UNMAPPED);
if (!fbuf)
return -ENOMEM;
@@ -353,7 +337,7 @@ xfs_ialloc_inode_init(
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
- for (i = 0; i < mp->m_inodes_per_cluster; i++) {
+ for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
uint isize = xfs_dinode_size(version);
@@ -616,24 +600,26 @@ error:
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
-STATIC int /* error code or 0 */
+STATIC int
xfs_ialloc_ag_alloc(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* alloc group buffer */
- int *alloc)
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ int *alloc)
{
- xfs_agi_t *agi; /* allocation group header */
- xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_agnumber_t agno;
- int error;
- xfs_agino_t newino; /* new first inode's number */
- xfs_agino_t newlen; /* new number of inodes */
- int isaligned = 0; /* inode allocation at stripe unit */
- /* boundary */
- uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */
+ struct xfs_agi *agi;
+ struct xfs_alloc_arg args;
+ xfs_agnumber_t agno;
+ int error;
+ xfs_agino_t newino; /* new first inode's number */
+ xfs_agino_t newlen; /* new number of inodes */
+ int isaligned = 0; /* inode allocation at stripe */
+ /* unit boundary */
+ /* init. to full chunk */
+ uint16_t allocmask = (uint16_t) -1;
struct xfs_inobt_rec_incore rec;
- struct xfs_perag *pag;
- int do_sparse = 0;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp);
+ int do_sparse = 0;
memset(&args, 0, sizeof(args));
args.tp = tp;
@@ -644,7 +630,7 @@ xfs_ialloc_ag_alloc(
#ifdef DEBUG
/* randomly do sparse inode allocations */
if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
- args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
+ igeo->ialloc_min_blks < igeo->ialloc_blks)
do_sparse = prandom_u32() & 1;
#endif
@@ -652,12 +638,12 @@ xfs_ialloc_ag_alloc(
* Locking will ensure that we don't have two callers in here
* at one time.
*/
- newlen = args.mp->m_ialloc_inos;
- if (args.mp->m_maxicount &&
+ newlen = igeo->ialloc_inos;
+ if (igeo->maxicount &&
percpu_counter_read_positive(&args.mp->m_icount) + newlen >
- args.mp->m_maxicount)
+ igeo->maxicount)
return -ENOSPC;
- args.minlen = args.maxlen = args.mp->m_ialloc_blks;
+ args.minlen = args.maxlen = igeo->ialloc_blks;
/*
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
@@ -667,7 +653,7 @@ xfs_ialloc_ag_alloc(
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
- args.mp->m_ialloc_blks;
+ igeo->ialloc_blks;
if (do_sparse)
goto sparse_alloc;
if (likely(newino != NULLAGINO &&
@@ -690,10 +676,10 @@ xfs_ialloc_ag_alloc(
* but not to use them in the actual exact allocation.
*/
args.alignment = 1;
- args.minalignslop = args.mp->m_cluster_align - 1;
+ args.minalignslop = igeo->cluster_align - 1;
/* Allow space for the inode btree to split. */
- args.minleft = args.mp->m_in_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -720,12 +706,12 @@ xfs_ialloc_ag_alloc(
* pieces, so don't need alignment anyway.
*/
isaligned = 0;
- if (args.mp->m_sinoalign) {
+ if (igeo->ialloc_align) {
ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else
- args.alignment = args.mp->m_cluster_align;
+ args.alignment = igeo->cluster_align;
/*
* Need to figure out where to allocate the inode blocks.
* Ideally they should be spaced out through the a.g.
@@ -741,7 +727,7 @@ xfs_ialloc_ag_alloc(
/*
* Allow space for the inode btree to split.
*/
- args.minleft = args.mp->m_in_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -754,7 +740,7 @@ xfs_ialloc_ag_alloc(
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.agbno = be32_to_cpu(agi->agi_root);
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
- args.alignment = args.mp->m_cluster_align;
+ args.alignment = igeo->cluster_align;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
@@ -764,7 +750,7 @@ xfs_ialloc_ag_alloc(
* the sparse allocation length is smaller than a full chunk.
*/
if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
- args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
+ igeo->ialloc_min_blks < igeo->ialloc_blks &&
args.fsbno == NULLFSBLOCK) {
sparse_alloc:
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -773,7 +759,7 @@ sparse_alloc:
args.alignment = args.mp->m_sb.sb_spino_align;
args.prod = 1;
- args.minlen = args.mp->m_ialloc_min_blks;
+ args.minlen = igeo->ialloc_min_blks;
args.maxlen = args.minlen;
/*
@@ -789,7 +775,7 @@ sparse_alloc:
args.min_agbno = args.mp->m_sb.sb_inoalignmt;
args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
args.mp->m_sb.sb_inoalignmt) -
- args.mp->m_ialloc_blks;
+ igeo->ialloc_blks;
error = xfs_alloc_vextent(&args);
if (error)
@@ -1006,7 +992,7 @@ xfs_ialloc_ag_select(
* space needed for alignment of inode chunks when checking the
* longest contiguous free space in the AG - this prevents us
* from getting ENOSPC because we have free space larger than
- * m_ialloc_blks but alignment constraints prevent us from using
+ * ialloc_blks but alignment constraints prevent us from using
* it.
*
* If we can't find an AG with space for full alignment slack to
@@ -1015,9 +1001,9 @@ xfs_ialloc_ag_select(
* if we fail allocation due to alignment issues then it is most
* likely a real ENOSPC condition.
*/
- ineed = mp->m_ialloc_min_blks;
+ ineed = M_IGEO(mp)->ialloc_min_blks;
if (flags && ineed > 1)
- ineed += mp->m_cluster_align;
+ ineed += M_IGEO(mp)->cluster_align;
longest = pag->pagf_longest;
if (!longest)
longest = pag->pagf_flcount > 0;
@@ -1703,6 +1689,7 @@ xfs_dialloc(
int noroom = 0;
xfs_agnumber_t start_agno;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
int okalloc = 1;
if (*IO_agbp) {
@@ -1733,9 +1720,9 @@ xfs_dialloc(
* Read rough value of mp->m_icount by percpu_counter_read_positive,
* which will sacrifice the preciseness but improve the performance.
*/
- if (mp->m_maxicount &&
- percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
- > mp->m_maxicount) {
+ if (igeo->maxicount &&
+ percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos
+ > igeo->maxicount) {
noroom = 1;
okalloc = 0;
}
@@ -1852,7 +1839,8 @@ xfs_difree_inode_chunk(
if (!xfs_inobt_issparse(rec->ir_holemask)) {
/* not sparse, calculate extent info directly */
xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
- mp->m_ialloc_blks, &XFS_RMAP_OINFO_INODES);
+ M_IGEO(mp)->ialloc_blks,
+ &XFS_RMAP_OINFO_INODES);
return;
}
@@ -2261,7 +2249,7 @@ xfs_imap_lookup(
/* check that the returned record contains the required inode */
if (rec.ir_startino > agino ||
- rec.ir_startino + mp->m_ialloc_inos <= agino)
+ rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
return -EINVAL;
/* for untrusted inodes check it is allocated first */
@@ -2352,7 +2340,7 @@ xfs_imap(
* If the inode cluster size is the same as the blocksize or
* smaller we get to the buffer by simple arithmetics.
*/
- if (mp->m_blocks_per_cluster == 1) {
+ if (M_IGEO(mp)->blocks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
@@ -2368,8 +2356,8 @@ xfs_imap(
* find the location. Otherwise we have to do a btree
* lookup to find the location.
*/
- if (mp->m_inoalign_mask) {
- offset_agbno = agbno & mp->m_inoalign_mask;
+ if (M_IGEO(mp)->inoalign_mask) {
+ offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
@@ -2381,13 +2369,13 @@ xfs_imap(
out_map:
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
- ((offset_agbno / mp->m_blocks_per_cluster) *
- mp->m_blocks_per_cluster);
+ ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
+ M_IGEO(mp)->blocks_per_cluster);
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
- imap->im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
+ imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
/*
@@ -2409,20 +2397,6 @@ out_map:
}
/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
- xfs_mount_t *mp) /* file system mount structure */
-{
- uint inodes;
-
- inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
- mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr,
- inodes);
-}
-
-/*
* Log specified fields for the ag hdr (inode section). The growth of the agi
* structure over time requires that we interpret the buffer as two logical
* regions delineated by the end of the unlinked list. This is due to the size
@@ -2493,7 +2467,7 @@ static xfs_failaddr_t
xfs_agi_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
int i;
@@ -2508,7 +2482,7 @@ xfs_agi_verify(
/*
* Validate the magic number of the agi block.
*/
- if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
+ if (!xfs_verify_magic(bp, agi->agi_magicnum))
return __this_address;
if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
return __this_address;
@@ -2545,7 +2519,7 @@ static void
xfs_agi_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
if (xfs_sb_version_hascrc(&mp->m_sb) &&
@@ -2562,7 +2536,7 @@ static void
xfs_agi_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -2582,6 +2556,7 @@ xfs_agi_write_verify(
const struct xfs_buf_ops xfs_agi_buf_ops = {
.name = "xfs_agi",
+ .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
.verify_read = xfs_agi_read_verify,
.verify_write = xfs_agi_write_verify,
.verify_struct = xfs_agi_verify,
@@ -2767,3 +2742,110 @@ xfs_ialloc_count_inodes(
*freecount = ci.freecount;
return 0;
}
+
+/*
+ * Initialize inode-related geometry information.
+ *
+ * Compute the inode btree record limits and max levels, then set maxicount.
+ *
+ * Set the inode cluster size. This may still be overridden by the file
+ * system block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to keep a
+ * constant ratio of inodes per cluster buffer, but only if mkfs has set the
+ * inode alignment value appropriately for larger cluster sizes.
+ *
+ * Then compute the inode cluster alignment information.
+ */
+void
+xfs_ialloc_setup_geometry(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ uint64_t icount;
+ uint inodes;
+
+ /* Compute inode btree geometry. */
+ igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
+ igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
+ igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;
+
+ igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
+ sbp->sb_inopblock);
+ igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;
+
+ if (sbp->sb_spino_align)
+ igeo->ialloc_min_blks = sbp->sb_spino_align;
+ else
+ igeo->ialloc_min_blks = igeo->ialloc_blks;
+
+ /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
+ inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
+ igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
+ inodes);
+
+ /* Set the maximum inode count for this filesystem. */
+ if (sbp->sb_imax_pct) {
+ /*
+ * Make sure the maximum inode count is a multiple
+ * of the units we allocate inodes in.
+ */
+ icount = sbp->sb_dblocks * sbp->sb_imax_pct;
+ do_div(icount, 100);
+ do_div(icount, igeo->ialloc_blks);
+ igeo->maxicount = XFS_FSB_TO_INO(mp,
+ icount * igeo->ialloc_blks);
+ } else {
+ igeo->maxicount = 0;
+ }
+
+ /*
+ * Compute the desired size of an inode cluster buffer, which
+ * starts at 8K and (on v5 filesystems) scales up with larger inode
+ * sizes.
+ *
+ * Preserve the desired inode cluster size because the sparse inodes
+ * feature uses that desired size (not the actual size) to compute the
+ * sparse inode alignment. The mount code validates this value, so we
+ * cannot change the behavior.
+ */
+ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int new_size = igeo->inode_cluster_size_raw;
+
+ new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+ if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+ igeo->inode_cluster_size_raw = new_size;
+ }
+
+ /* Calculate inode cluster ratios. */
+ if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
+ igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
+ igeo->inode_cluster_size_raw);
+ else
+ igeo->blocks_per_cluster = 1;
+ igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
+ igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);
+
+ /* Calculate inode cluster alignment. */
+ if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
+ igeo->cluster_align = mp->m_sb.sb_inoalignmt;
+ else
+ igeo->cluster_align = 1;
+ igeo->inoalign_mask = igeo->cluster_align - 1;
+ igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);
+
+ /*
+ * If we are using stripe alignment, check whether the stripe unit is a
+ * multiple of the inode alignment; if so, record it in ialloc_align (else 0).
+ */
+ if (mp->m_dalign && igeo->inoalign_mask &&
+ !(mp->m_dalign & igeo->inoalign_mask))
+ igeo->ialloc_align = mp->m_dalign;
+ else
+ igeo->ialloc_align = 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index e936b7cc9389..323592d563d5 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -23,16 +23,6 @@ struct xfs_icluster {
* sparse chunks */
};
-/* Calculate and return the number of filesystem blocks per inode cluster */
-static inline int
-xfs_icluster_size_fsb(
- struct xfs_mount *mp)
-{
- if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
- return 1;
- return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
-}
-
/*
* Make an inode pointer out of the buffer/offset.
*/
@@ -96,13 +86,6 @@ xfs_imap(
uint flags); /* flags for inode btree lookup */
/*
- * Compute and fill in value of m_in_maxlevels.
- */
-void
-xfs_ialloc_compute_maxlevels(
- struct xfs_mount *mp); /* file system mount structure */
-
-/*
* Log specified fields for the ag hdr (inode section)
*/
void
@@ -168,5 +151,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
int *stat);
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
+void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9b25e7a0df47..b82992f795aa 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -11,14 +11,12 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_rmap.h"
@@ -28,7 +26,7 @@ xfs_inobt_get_minrecs(
struct xfs_btree_cur *cur,
int level)
{
- return cur->bc_mp->m_inobt_mnr[level != 0];
+ return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0];
}
STATIC struct xfs_btree_cur *
@@ -124,7 +122,7 @@ xfs_finobt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_alloc_block(cur, start, new, stat);
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
@@ -154,7 +152,7 @@ xfs_finobt_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
- if (cur->bc_mp->m_inotbt_nores)
+ if (cur->bc_mp->m_finobt_nores)
return xfs_inobt_free_block(cur, bp);
return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA);
}
@@ -164,7 +162,7 @@ xfs_inobt_get_maxrecs(
struct xfs_btree_cur *cur,
int level)
{
- return cur->bc_mp->m_inobt_mxr[level != 0];
+ return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0];
}
STATIC void
@@ -255,11 +253,14 @@ static xfs_failaddr_t
xfs_inobt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_failaddr_t fa;
unsigned int level;
+ if (!xfs_verify_magic(bp, block->bb_magic))
+ return __this_address;
+
/*
* During growfs operations, we can't verify the exact owner as the
* perag is not fully initialised and hence not attached to the buffer.
@@ -270,26 +271,19 @@ xfs_inobt_verify(
* but beware of the landmine (i.e. need to check pag->pagi_init) if we
* ever do.
*/
- switch (block->bb_magic) {
- case cpu_to_be32(XFS_IBT_CRC_MAGIC):
- case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
fa = xfs_btree_sblock_v5hdr_verify(bp);
if (fa)
return fa;
- /* fall through */
- case cpu_to_be32(XFS_IBT_MAGIC):
- case cpu_to_be32(XFS_FIBT_MAGIC):
- break;
- default:
- return __this_address;
}
/* level verification */
level = be16_to_cpu(block->bb_level);
- if (level >= mp->m_in_maxlevels)
+ if (level >= M_IGEO(mp)->inobt_maxlevels)
return __this_address;
- return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
+ return xfs_btree_sblock_verify(bp,
+ M_IGEO(mp)->inobt_mxr[level != 0]);
}
static void
@@ -328,6 +322,16 @@ xfs_inobt_write_verify(
const struct xfs_buf_ops xfs_inobt_buf_ops = {
.name = "xfs_inobt",
+ .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) },
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+ .verify_struct = xfs_inobt_verify,
+};
+
+const struct xfs_buf_ops xfs_finobt_buf_ops = {
+ .name = "xfs_finobt",
+ .magic = { cpu_to_be32(XFS_FIBT_MAGIC),
+ cpu_to_be32(XFS_FIBT_CRC_MAGIC) },
.verify_read = xfs_inobt_read_verify,
.verify_write = xfs_inobt_write_verify,
.verify_struct = xfs_inobt_verify,
@@ -389,7 +393,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
- .buf_ops = &xfs_inobt_buf_ops,
+ .buf_ops = &xfs_finobt_buf_ops,
.diff_two_keys = xfs_inobt_diff_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
@@ -541,14 +545,53 @@ xfs_inobt_max_size(
xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
/* Bail out if we're uninitialized, which can happen in mkfs. */
- if (mp->m_inobt_mxr[0] == 0)
+ if (M_IGEO(mp)->inobt_mxr[0] == 0)
return 0;
- return xfs_btree_calc_size(mp->m_inobt_mnr,
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ agblocks -= mp->m_sb.sb_logblocks;
+
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr,
(uint64_t)agblocks * mp->m_sb.sb_inopblock /
XFS_INODES_PER_CHUNK);
}
+/* Read the AGI for @agno and create a @which inode btree cursor on it; *curpp and *agi_bpp must be NULL on entry. */
+int
+xfs_inobt_cur(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_btnum_t which,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp)
+{
+ struct xfs_btree_cur *cur;
+ int error;
+
+ ASSERT(*agi_bpp == NULL);
+ ASSERT(*curpp == NULL);
+
+ error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp);
+ if (error)
+ return error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which);
+ if (!cur) {
+ xfs_trans_brelse(tp, *agi_bpp);
+ *agi_bpp = NULL;
+ return -ENOMEM;
+ }
+ *curpp = cur;
+ return 0;
+}
+
static int
xfs_inobt_count_blocks(
struct xfs_mount *mp,
@@ -557,15 +600,14 @@ xfs_inobt_count_blocks(
xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
- struct xfs_buf *agbp;
- struct xfs_btree_cur *cur;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_btree_cur *cur = NULL;
int error;
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp);
if (error)
return error;
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error);
xfs_trans_brelse(tp, agbp);
@@ -605,5 +647,5 @@ xfs_iallocbt_calc_size(
struct xfs_mount *mp,
unsigned long long len)
{
- return xfs_btree_calc_size(mp->m_inobt_mnr, len);
+ return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len);
}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index ebdd0c6b8766..951305ecaae1 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -64,5 +64,8 @@ int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
+int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, xfs_btnum_t btnum,
+ struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 771dd072015d..27aa3f2bc4bc 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -3,18 +3,14 @@
* Copyright (c) 2017 Christoph Hellwig.
*/
-#include <linux/cache.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_bmap.h"
#include "xfs_trace.h"
/*
@@ -614,16 +610,15 @@ xfs_iext_realloc_root(
}
/*
- * Increment the sequence counter if we are on a COW fork. This allows
- * the writeback code to skip looking for a COW extent if the COW fork
- * hasn't changed. We use WRITE_ONCE here to ensure the update to the
- * sequence counter is seen before the modifications to the extent
- * tree itself take effect.
+ * Increment the sequence counter on extent tree changes. If we are on a COW
+ * fork, this allows the writeback code to skip looking for a COW extent if the
+ * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the
+ * sequence counter is seen before the modifications to the extent tree itself
+ * take effect.
*/
static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state)
{
- if (state & BMAP_COWFORK)
- WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
+ WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1);
}
void
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 09d9c8cfa4a0..28ab3c5255e1 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -10,11 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_cksum.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_ialloc.h"
@@ -33,12 +31,9 @@ xfs_inobp_check(
xfs_buf_t *bp)
{
int i;
- int j;
xfs_dinode_t *dip;
- j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
-
- for (i = 0; i < j; i++) {
+ for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
if (!dip->di_next_unlinked) {
xfs_alert(mp,
@@ -80,7 +75,7 @@ xfs_inode_buf_verify(
struct xfs_buf *bp,
bool readahead)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_agnumber_t agno;
int i;
int ni;
@@ -97,10 +92,9 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
- (unlinked_ino == NULLAGINO ||
- xfs_verify_agino(mp, agno, unlinked_ino));
+ xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
@@ -147,12 +141,16 @@ xfs_inode_buf_write_verify(
const struct xfs_buf_ops xfs_inode_buf_ops = {
.name = "xfs_inode",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_read_verify,
.verify_write = xfs_inode_buf_write_verify,
};
const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
- .name = "xxfs_inode_ra",
+ .name = "xfs_inode_ra",
+ .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC),
+ cpu_to_be16(XFS_DINODE_MAGIC) },
.verify_read = xfs_inode_buf_readahead_verify,
.verify_write = xfs_inode_buf_write_verify,
};
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index f9acf1d436f6..bf3e04018246 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -3,10 +3,10 @@
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/log2.h>
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -19,12 +19,10 @@
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_attr_sf.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
-#include "xfs_shared.h"
kmem_zone_t *xfs_ifork_zone;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 60361d2d74a1..00c62ce170d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -14,7 +14,7 @@ struct xfs_dinode;
*/
struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
- unsigned int if_seq; /* cow fork mod counter */
+ unsigned int if_seq; /* fork mod counter */
struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 1b542ec11d5d..7f55eb3f3653 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -12,9 +12,7 @@
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_trans_space.h"
-#include "xfs_inode.h"
#include "xfs_da_btree.h"
-#include "xfs_attr_leaf.h"
#include "xfs_bmap_btree.h"
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 4bfdd5f4c6af..b2113b17e53c 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -142,7 +142,7 @@ extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
+extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
xfs_dqid_t id, uint type);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 542aa1475b5f..51bb9bdb0e84 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -9,7 +9,6 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
@@ -19,7 +18,6 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_refcount.h"
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index d9eab657b63e..38529dbacd55 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -12,12 +12,10 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "xfs_bmap.h"
#include "xfs_refcount_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_rmap.h"
@@ -203,13 +201,13 @@ STATIC xfs_failaddr_t
xfs_refcountbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
unsigned int level;
- if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasreflink(&mp->m_sb))
@@ -264,6 +262,7 @@ xfs_refcountbt_write_verify(
const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
.name = "xfs_refcountbt",
+ .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) },
.verify_read = xfs_refcountbt_read_verify,
.verify_write = xfs_refcountbt_write_verify,
.verify_struct = xfs_refcountbt_verify,
@@ -426,6 +425,15 @@ xfs_refcountbt_calc_reserves(
tree_len = be32_to_cpu(agf->agf_refcount_blocks);
xfs_trans_brelse(tp, agbp);
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ agblocks -= mp->m_sb.sb_logblocks;
+
*ask += xfs_refcountbt_max_size(mp, agblocks);
*used += tree_len;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 8ed885507dd8..e6aeb390b2fb 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -10,24 +10,17 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
/*
* Lookup the first record less than or equal to [bno, len, owner, offset]
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index f79cf040d745..fc78efa52c94 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -9,18 +9,14 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
@@ -292,7 +288,7 @@ static xfs_failaddr_t
xfs_rmapbt_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_perag *pag = bp->b_pag;
xfs_failaddr_t fa;
@@ -310,7 +306,7 @@ xfs_rmapbt_verify(
* from the on disk AGF. Again, we can only check against maximum limits
* in this case.
*/
- if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
+ if (!xfs_verify_magic(bp, block->bb_magic))
return __this_address;
if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
@@ -365,6 +361,7 @@ xfs_rmapbt_write_verify(
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
.name = "xfs_rmapbt",
+ .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
.verify_read = xfs_rmapbt_read_verify,
.verify_write = xfs_rmapbt_write_verify,
.verify_struct = xfs_rmapbt_verify,
@@ -577,6 +574,15 @@ xfs_rmapbt_calc_reserves(
tree_len = be32_to_cpu(agf->agf_rmap_blocks);
xfs_trans_brelse(tp, agbp);
+ /*
+ * The log is permanently allocated, so the space it occupies will
+ * never be available for the kinds of things that would require btree
+ * expansion. We therefore can pretend the space isn't there.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno)
+ agblocks -= mp->m_sb.sb_logblocks;
+
/* Reserve 1% of the AG or enough for 1 block per record. */
*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
*used += tree_len;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index eaaff67e9626..8ea1efc97b41 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -13,15 +13,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
-#include "xfs_icache.h"
#include "xfs_rtalloc.h"
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b5a82acd7dfe..a08dd8f40346 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -10,26 +10,20 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_log.h"
#include "xfs_rmap_btree.h"
-#include "xfs_bmap.h"
#include "xfs_refcount_btree.h"
#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
+#include "xfs_health.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -225,10 +219,11 @@ xfs_validate_sb_common(
struct xfs_buf *bp,
struct xfs_sb *sbp)
{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
uint32_t agcount = 0;
uint32_t rem;
- if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
xfs_warn(mp, "bad magic number");
return -EWRONGFS;
}
@@ -684,7 +679,7 @@ xfs_sb_read_verify(
struct xfs_buf *bp)
{
struct xfs_sb sb;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
int error;
@@ -750,7 +745,7 @@ xfs_sb_write_verify(
struct xfs_buf *bp)
{
struct xfs_sb sb;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
int error;
@@ -781,12 +776,14 @@ out_error:
const struct xfs_buf_ops xfs_sb_buf_ops = {
.name = "xfs_sb",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_read_verify,
.verify_write = xfs_sb_write_verify,
};
const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
.name = "xfs_sb_quiet",
+ .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) },
.verify_read = xfs_sb_quiet_read_verify,
.verify_write = xfs_sb_write_verify,
};
@@ -796,12 +793,14 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
*
* Mount initialization code establishing various mount
* fields from the superblock associated with the given
- * mount structure
+ * mount structure.
+ *
+ * Inode geometry is calculated in xfs_ialloc_setup_geometry.
*/
void
xfs_sb_mount_common(
- struct xfs_mount *mp,
- struct xfs_sb *sbp)
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
{
mp->m_agfrotor = mp->m_agirotor = 0;
mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -809,7 +808,6 @@ xfs_sb_mount_common(
mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT;
mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
- mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
@@ -819,11 +817,6 @@ xfs_sb_mount_common(
mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
- mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
- mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
- mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
- mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
-
mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
@@ -840,14 +833,6 @@ xfs_sb_mount_common(
mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
- mp->m_ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
- sbp->sb_inopblock);
- mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog;
-
- if (sbp->sb_spino_align)
- mp->m_ialloc_min_blks = sbp->sb_spino_align;
- else
- mp->m_ialloc_min_blks = mp->m_ialloc_blks;
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp);
}
@@ -874,7 +859,7 @@ xfs_initialize_perag_data(
uint64_t bfreelst = 0;
uint64_t btree = 0;
uint64_t fdblocks;
- int error;
+ int error = 0;
for (index = 0; index < agcount; index++) {
/*
@@ -902,7 +887,7 @@ xfs_initialize_perag_data(
/*
* If the new summary counts are obviously incorrect, fail the
* mount operation because that implies the AGFs are also corrupt.
- * Clear BAD_SUMMARY so that we don't unmount with a dirty log, which
+ * Clear FS_COUNTERS so that we don't unmount with a dirty log, which
* will prevent xfs_repair from fixing anything.
*/
if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
@@ -920,7 +905,7 @@ xfs_initialize_perag_data(
xfs_reinit_percpu_counters(mp);
out:
- mp->m_flags &= ~XFS_MOUNT_BAD_SUMMARY;
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
return error;
}
@@ -935,7 +920,7 @@ xfs_log_sb(
struct xfs_trans *tp)
{
struct xfs_mount *mp = tp->t_mountp;
- struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
+ struct xfs_buf *bp = xfs_trans_getsb(tp, mp);
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
@@ -1001,7 +986,7 @@ xfs_update_secondary_sbs(
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
- XFS_FSS_TO_BB(mp, 1), 0);
+ XFS_FSS_TO_BB(mp, 1));
/*
* If we get an error reading or writing alternate superblocks,
* continue. xfs_repair chooses the "best" superblock based
@@ -1065,7 +1050,7 @@ xfs_sync_sb_buf(
if (error)
return error;
- bp = xfs_trans_getsb(tp, mp, 0);
+ bp = xfs_trans_getsb(tp, mp);
xfs_log_sb(tp);
xfs_trans_bhold(tp, bp);
xfs_trans_set_sync(tp);
@@ -1081,7 +1066,7 @@ out:
return error;
}
-int
+void
xfs_fs_geometry(
struct xfs_sb *sbp,
struct xfs_fsop_geom *geo,
@@ -1105,13 +1090,13 @@ xfs_fs_geometry(
memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid));
if (struct_version < 2)
- return 0;
+ return;
geo->sunit = sbp->sb_unit;
geo->swidth = sbp->sb_width;
if (struct_version < 3)
- return 0;
+ return;
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
@@ -1155,14 +1140,17 @@ xfs_fs_geometry(
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
if (struct_version < 4)
- return 0;
+ return;
if (xfs_sb_version_haslogv2(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2;
geo->logsunit = sbp->sb_logsunit;
- return 0;
+ if (struct_version < 5)
+ return;
+
+ geo->version = XFS_FSOP_GEOM_VERSION_V5;
}
/* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 13564d69800a..92465a9a5162 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -33,7 +33,7 @@ extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
-extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
+extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
int struct_version);
extern int xfs_sb_read_secondary(struct xfs_mount *mp,
struct xfs_trans *tp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 1c5debe748f0..e0641b7337b3 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_agf_buf_ops;
extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_bnobt_buf_ops;
+extern const struct xfs_buf_ops xfs_cntbt_buf_ops;
extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
@@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_agi_buf_ops;
extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_finobt_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ops;
extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
extern const struct xfs_buf_ops xfs_dquot_buf_ops;
@@ -63,7 +65,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
-#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
@@ -134,4 +135,52 @@ void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_inode *ip, struct xfs_ifork *ifp);
xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
+/* Computed inode geometry for the filesystem. */
+struct xfs_ino_geometry {
+ /* Maximum inode count in this filesystem. */
+ uint64_t maxicount;
+
+ /* Actual inode cluster buffer size, in bytes. */
+ unsigned int inode_cluster_size;
+
+ /*
+ * Desired inode cluster buffer size, in bytes. This value is not
+ * rounded up to at least one filesystem block, which is necessary for
+ * the sole purpose of validating sb_spino_align. Runtime code must
+ * only ever use inode_cluster_size.
+ */
+ unsigned int inode_cluster_size_raw;
+
+ /* Inode cluster sizes, adjusted to be at least 1 fsb. */
+ unsigned int inodes_per_cluster;
+ unsigned int blocks_per_cluster;
+
+ /* Inode cluster alignment. */
+ unsigned int cluster_align;
+ unsigned int cluster_align_inodes;
+ unsigned int inoalign_mask; /* mask sb_inoalignmt if used */
+
+ unsigned int inobt_mxr[2]; /* max inobt btree records */
+ unsigned int inobt_mnr[2]; /* min inobt btree records */
+ unsigned int inobt_maxlevels; /* max inobt btree levels. */
+
+ /* Size of inode allocations under normal operation. */
+ unsigned int ialloc_inos;
+ unsigned int ialloc_blks;
+
+ /* Minimum inode blocks for a sparse allocation. */
+ unsigned int ialloc_min_blks;
+
+ /* stripe unit inode alignment */
+ unsigned int ialloc_align;
+
+ unsigned int agino_log; /* #bits for agino in inum */
+};
+
+/* Keep iterating the data structure. */
+#define XFS_ITER_CONTINUE (0)
+
+/* Stop iterating the data structure. */
+#define XFS_ITER_ABORT (1)
+
#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 77d80106f989..3b8260ca7d1b 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -11,12 +11,8 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_error.h"
-#include "xfs_trace.h"
-#include "xfs_symlink.h"
-#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
@@ -90,12 +86,12 @@ static xfs_failaddr_t
xfs_symlink_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
if (!xfs_sb_version_hascrc(&mp->m_sb))
return __this_address;
- if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC))
+ if (!xfs_verify_magic(bp, dsl->sl_magic))
return __this_address;
if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
@@ -116,7 +112,7 @@ static void
xfs_symlink_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
/* no verification of non-crc buffers */
@@ -136,7 +132,7 @@ static void
xfs_symlink_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_log_item;
xfs_failaddr_t fa;
@@ -159,6 +155,7 @@ xfs_symlink_write_verify(
const struct xfs_buf_ops xfs_symlink_buf_ops = {
.name = "xfs_symlink",
+ .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) },
.verify_read = xfs_symlink_read_verify,
.verify_write = xfs_symlink_write_verify,
.verify_struct = xfs_symlink_verify,
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 542927321a61..a9ad90926b87 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -8,13 +8,10 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
-#include "xfs_trace.h"
#include <linux/iversion.h>
@@ -69,6 +66,10 @@ xfs_trans_ichgtime(
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
inode->i_ctime = tv;
+ if (flags & XFS_ICHGTIME_CREATE) {
+ ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec;
+ ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
+ }
}
/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f99a7aefe418..d12bbd526e7c 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -15,12 +15,10 @@
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
-#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -136,9 +134,10 @@ STATIC uint
xfs_calc_inobt_res(
struct xfs_mount *mp)
{
- return xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
}
/*
@@ -167,7 +166,7 @@ xfs_calc_finobt_res(
* includes:
*
* the allocation btrees: 2 trees * (max depth - 1) * block size
- * the inode chunk: m_ialloc_blks * N
+ * the inode chunk: m_ino_geo.ialloc_blks * N
*
* The size N of the inode chunk reservation depends on whether it is for
* allocation or free and which type of create transaction is in use. An inode
@@ -193,7 +192,7 @@ xfs_calc_inode_chunk_res(
size = XFS_FSB_TO_B(mp, 1);
}
- res += xfs_calc_buf_res(mp->m_ialloc_blks, size);
+ res += xfs_calc_buf_res(M_IGEO(mp)->ialloc_blks, size);
return res;
}
@@ -307,7 +306,7 @@ xfs_calc_iunlink_remove_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- 2 * max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+ 2 * M_IGEO(mp)->inode_cluster_size;
}
/*
@@ -345,7 +344,7 @@ STATIC uint
xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
{
return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
- max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+ M_IGEO(mp)->inode_cluster_size;
}
/*
@@ -876,9 +875,13 @@ xfs_trans_resv_calc(
resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp);
resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT;
+ /* growdata requires permanent res; it can free space to the last AG */
+ resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
+ resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
+ resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
/* The following transaction are logged in logical format */
resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
- resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index a62fb950bef1..88221c7a04cc 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -56,9 +56,9 @@
#define XFS_DIRREMOVE_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
- ((mp)->m_ialloc_blks + \
+ (M_IGEO(mp)->ialloc_blks + \
(xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
- ((mp)->m_in_maxlevels - 1)))
+ (M_IGEO(mp)->inobt_maxlevels - 1)))
/*
* Space reservation values for various transactions.
@@ -94,7 +94,8 @@
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
#define XFS_IFREE_SPACE_RES(mp) \
- (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ? \
+ M_IGEO(mp)->inobt_maxlevels : 0)
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c
index 3306fc42cfad..4f595546a639 100644
--- a/fs/xfs/libxfs/xfs_types.c
+++ b/fs/xfs/libxfs/xfs_types.c
@@ -7,19 +7,10 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_rmap.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
/* Find the size of the AG, in blocks. */
xfs_agblock_t
@@ -87,14 +78,14 @@ xfs_agino_range(
* Calculate the first inode, which will be in the first
* cluster-aligned block after the AGFL.
*/
- bno = round_up(XFS_AGFL_BLOCK(mp) + 1, mp->m_cluster_align);
+ bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align);
*first = XFS_AGB_TO_AGINO(mp, bno);
/*
* Calculate the last inode, which will be at the end of the
* last (aligned) cluster that can be allocated in the AG.
*/
- bno = round_down(eoag, mp->m_cluster_align);
+ bno = round_down(eoag, M_IGEO(mp)->cluster_align);
*last = XFS_AGB_TO_AGINO(mp, bno) - 1;
}
@@ -116,6 +107,19 @@ xfs_verify_agino(
}
/*
+ * Verify that an AG inode number pointer neither points outside the AG
+ * nor points at static metadata, or is NULLAGINO.
+ */
+bool
+xfs_verify_agino_or_null(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino)
+{
+ return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino);
+}
+
+/*
* Verify that an FS inode number pointer neither points outside the
* filesystem nor points at static AG metadata.
*/
@@ -172,7 +176,7 @@ xfs_verify_rtbno(
}
/* Calculate the range of valid icount values. */
-static void
+void
xfs_icount_range(
struct xfs_mount *mp,
unsigned long long *min,
@@ -204,3 +208,14 @@ xfs_verify_icount(
xfs_icount_range(mp, &min, &max);
return icount >= min && icount <= max;
}
+
+/* Sanity-checking of dir/attr block offsets. */
+bool
+xfs_verify_dablk(
+ struct xfs_mount *mp,
+ xfs_fileoff_t dabno)
+{
+ xfs_dablk_t max_dablk = -1U;
+
+ return dabno <= max_dablk;
+}
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 8f02855a019a..802b34cd10fe 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -183,10 +183,15 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t *first, xfs_agino_t *last);
bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agino_t agino);
+bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t agino);
bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount);
+bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off);
+void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min,
+ unsigned long long *max);
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 90955ab1e895..16b09b941441 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -9,20 +9,13 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
-#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_rmap.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Superblock */
@@ -399,7 +392,7 @@ xchk_agf_xref_cntbt(
if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur))
return;
if (!have) {
- if (agf->agf_freeblks != be32_to_cpu(0))
+ if (agf->agf_freeblks != cpu_to_be32(0))
xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp);
return;
}
@@ -514,6 +507,7 @@ xchk_agf(
{
struct xfs_mount *mp = sc->mp;
struct xfs_agf *agf;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -586,6 +580,16 @@ xchk_agf(
if (agfl_count != 0 && fl_count != agfl_count)
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ /* Do the incore counters match? */
+ pag = xfs_perag_get(mp, agno);
+ if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks))
+ xchk_block_set_corrupt(sc, sc->sa.agf_bp);
+ xfs_perag_put(pag);
+
xchk_agf_xref(sc);
out:
return error;
@@ -635,7 +639,7 @@ xchk_agfl_block(
xchk_agfl_block_xref(sc, agbno);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return XFS_BTREE_QUERY_RANGE_ABORT;
+ return XFS_ITER_ABORT;
return 0;
}
@@ -726,7 +730,7 @@ xchk_agfl(
/* Check the blocks in the AGFL. */
error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
sc->sa.agfl_bp, xchk_agfl_block, &sai);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ if (error == XFS_ITER_ABORT) {
error = 0;
goto out_free;
}
@@ -811,6 +815,7 @@ xchk_agi(
{
struct xfs_mount *mp = sc->mp;
struct xfs_agi *agi;
+ struct xfs_perag *pag;
xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_agblock_t eoag;
@@ -864,25 +869,31 @@ xchk_agi(
/* Check inode pointers */
agino = be32_to_cpu(agi->agi_newino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
agino = be32_to_cpu(agi->agi_dirino);
- if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
/* Check unlinked inode buckets */
for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
agino = be32_to_cpu(agi->agi_unlinked[i]);
- if (agino == NULLAGINO)
- continue;
- if (!xfs_verify_agino(mp, agno, agino))
+ if (!xfs_verify_agino_or_null(mp, agno, agino))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
}
if (agi->agi_pad32 != cpu_to_be32(0))
xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ /* Do the incore counters match? */
+ pag = xfs_perag_get(mp, agno);
+ if (pag->pagi_count != be32_to_cpu(agi->agi_count))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount))
+ xchk_block_set_corrupt(sc, sc->sa.agi_bp);
+ xfs_perag_put(pag);
+
xchk_agi_xref(sc);
out:
return error;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 03d1e15cceba..7a1a38b636a9 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -9,22 +9,17 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
-#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -341,23 +336,19 @@ xrep_agf(
struct xrep_find_ag_btree fab[XREP_AGF_MAX] = {
[XREP_AGF_BNOBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTB_CRC_MAGIC,
+ .buf_ops = &xfs_bnobt_buf_ops,
},
[XREP_AGF_CNTBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
- .buf_ops = &xfs_allocbt_buf_ops,
- .magic = XFS_ABTC_CRC_MAGIC,
+ .buf_ops = &xfs_cntbt_buf_ops,
},
[XREP_AGF_RMAPBT] = {
.rmap_owner = XFS_RMAP_OWN_AG,
.buf_ops = &xfs_rmapbt_buf_ops,
- .magic = XFS_RMAP_CRC_MAGIC,
},
[XREP_AGF_REFCOUNTBT] = {
.rmap_owner = XFS_RMAP_OWN_REFC,
.buf_ops = &xfs_refcountbt_buf_ops,
- .magic = XFS_REFC_CRC_MAGIC,
},
[XREP_AGF_END] = {
.buf_ops = NULL,
@@ -875,12 +866,10 @@ xrep_agi(
[XREP_AGI_INOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
.buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_IBT_CRC_MAGIC,
},
[XREP_AGI_FINOBT] = {
.rmap_owner = XFS_RMAP_OWN_INOBT,
- .buf_ops = &xfs_inobt_buf_ops,
- .magic = XFS_FIBT_CRC_MAGIC,
+ .buf_ops = &xfs_finobt_buf_ops,
},
[XREP_AGI_END] = {
.buf_ops = NULL
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 44883e9112ad..a43d1813c4ff 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,19 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Set us up to scrub free space btrees.
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 81d5e90547a1..1afc58bf71dd 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -9,26 +9,62 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
-#include "scrub/trace.h"
+#include "scrub/attr.h"
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
+/*
+ * Allocate enough memory to hold an attr value and attr block bitmaps,
+ * reallocating the buffer if necessary. Buffer contents are not preserved
+ * across a reallocation.
+ */
+int
+xchk_setup_xattr_buf(
+ struct xfs_scrub *sc,
+ size_t value_size,
+ xfs_km_flags_t flags)
+{
+ size_t sz;
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ /*
+ * We need enough space to read an xattr value from the file or enough
+ * space to hold three copies of the xattr free space bitmap. We don't
+ * need the buffer space for both purposes at the same time.
+ */
+ sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+ sz = max_t(size_t, sz, value_size);
+
+ /*
+ * If there's already a buffer, figure out if we need to reallocate it
+ * to accommodate a larger size.
+ */
+ if (ab) {
+ if (sz <= ab->sz)
+ return 0;
+ kmem_free(ab);
+ sc->buf = NULL;
+ }
+
+ /*
+ * Don't zero the buffer upon allocation to avoid runtime overhead.
+ * All users must be careful never to read uninitialized contents.
+ */
+ ab = kmem_alloc_large(sizeof(*ab) + sz, flags);
+ if (!ab)
+ return -ENOMEM;
+
+ ab->sz = sz;
+ sc->buf = ab;
+ return 0;
+}
/* Set us up to scrub an inode's extended attributes. */
int
@@ -36,19 +72,18 @@ xchk_setup_xattr(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- size_t sz;
+ int error;
/*
- * Allocate the buffer without the inode lock held. We need enough
- * space to read every xattr value in the file or enough space to
- * hold three copies of the xattr free space bitmap. (Not both at
- * the same time.)
+ * We failed to get memory while checking attrs, so this time try to
+ * get all the memory we're ever going to need. Allocate the buffer
+ * without the inode lock held, which means we can sleep.
*/
- sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) *
- BITS_TO_LONGS(sc->mp->m_attr_geo->blksize));
- sc->buf = kmem_zalloc_large(sz, KM_SLEEP);
- if (!sc->buf)
- return -ENOMEM;
+ if (sc->flags & XCHK_TRY_HARDER) {
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP);
+ if (error)
+ return error;
+ }
return xchk_setup_inode_contents(sc, ip, 0);
}
@@ -82,12 +117,36 @@ xchk_xattr_listent(
sx = container_of(context, struct xchk_xattr, context);
+ if (xchk_should_terminate(sx->sc, &error)) {
+ context->seen_enough = error;
+ return;
+ }
+
if (flags & XFS_ATTR_INCOMPLETE) {
/* Incomplete attr key, just mark the inode for preening. */
xchk_ino_set_preen(sx->sc, context->dp->i_ino);
return;
}
+ /* Does this name make sense? */
+ if (!xfs_attr_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+ return;
+ }
+
+ /*
+ * Try to allocate enough memory to extract the attr value. If that
+ * doesn't work, we overload the seen_enough variable to convey
+ * the error message back to the main scrub function.
+ */
+ error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL);
+ if (error == -ENOMEM)
+ error = -EDEADLOCK;
+ if (error) {
+ context->seen_enough = error;
+ return;
+ }
+
args.flags = ATTR_KERNOTIME;
if (flags & XFS_ATTR_ROOT)
args.flags |= ATTR_ROOT;
@@ -100,8 +159,8 @@ xchk_xattr_listent(
args.namelen = namelen;
args.hashval = xfs_da_hashname(args.name, args.namelen);
args.trans = context->tp;
- args.value = sx->sc->buf;
- args.valuelen = XATTR_SIZE_MAX;
+ args.value = xchk_xattr_valuebuf(sx->sc);
+ args.valuelen = valuelen;
error = xfs_attr_get_ilocked(context->dp, &args);
if (error == -EEXIST)
@@ -114,7 +173,7 @@ xchk_xattr_listent(
args.blkno);
fail_xref:
if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- context->seen_enough = 1;
+ context->seen_enough = XFS_ITER_ABORT;
return;
}
@@ -159,13 +218,12 @@ xchk_xattr_check_freemap(
unsigned long *map,
struct xfs_attr3_icleaf_hdr *leafhdr)
{
- unsigned long *freemap;
- unsigned long *dstmap;
+ unsigned long *freemap = xchk_xattr_freemap(sc);
+ unsigned long *dstmap = xchk_xattr_dstmap(sc);
unsigned int mapsize = sc->mp->m_attr_geo->blksize;
int i;
/* Construct bitmap of freemap contents. */
- freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
bitmap_zero(freemap, mapsize);
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
if (!xchk_xattr_set_map(sc, freemap,
@@ -175,7 +233,6 @@ xchk_xattr_check_freemap(
}
/* Look for bits that are set in freemap and are marked in use. */
- dstmap = freemap + BITS_TO_LONGS(mapsize);
return bitmap_and(dstmap, freemap, map, mapsize) == 0;
}
@@ -190,13 +247,13 @@ xchk_xattr_entry(
char *buf_end,
struct xfs_attr_leafblock *leaf,
struct xfs_attr3_icleaf_hdr *leafhdr,
- unsigned long *usedmap,
struct xfs_attr_leaf_entry *ent,
int idx,
unsigned int *usedbytes,
__u32 *last_hashval)
{
struct xfs_mount *mp = ds->state->mp;
+ unsigned long *usedmap = xchk_xattr_usedmap(ds->sc);
char *name_end;
struct xfs_attr_leaf_name_local *lentry;
struct xfs_attr_leaf_name_remote *rentry;
@@ -256,16 +313,26 @@ xchk_xattr_block(
struct xfs_attr_leafblock *leaf = bp->b_addr;
struct xfs_attr_leaf_entry *ent;
struct xfs_attr_leaf_entry *entries;
- unsigned long *usedmap = ds->sc->buf;
+ unsigned long *usedmap;
char *buf_end;
size_t off;
__u32 last_hashval = 0;
unsigned int usedbytes = 0;
unsigned int hdrsize;
int i;
+ int error;
if (*last_checked == blk->blkno)
return 0;
+
+ /* Allocate memory for block usage checking. */
+ error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL);
+ if (error == -ENOMEM)
+ return -EDEADLOCK;
+ if (error)
+ return error;
+ usedmap = xchk_xattr_usedmap(ds->sc);
+
*last_checked = blk->blkno;
bitmap_zero(usedmap, mp->m_attr_geo->blksize);
@@ -313,7 +380,7 @@ xchk_xattr_block(
/* Check the entry and nameval. */
xchk_xattr_entry(ds, level, buf_end, leaf, &leafhdr,
- usedmap, ent, i, &usedbytes, &last_hashval);
+ ent, i, &usedbytes, &last_hashval);
if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
goto out;
@@ -453,6 +520,10 @@ xchk_xattr(
error = xfs_attr_list_int_ilocked(&sx.context);
if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error))
goto out;
+
+ /* Did our listent function try to return any errors? */
+ if (sx.context.seen_enough < 0)
+ error = sx.context.seen_enough;
out:
return error;
}
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
new file mode 100644
index 000000000000..13a1d2e8424d
--- /dev/null
+++ b/fs/xfs/scrub/attr.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_ATTR_H__
+#define __XFS_SCRUB_ATTR_H__
+
+/*
+ * Temporary storage for online scrub and repair of extended attributes.
+ */
+struct xchk_xattr_buf {
+ /* Size of @buf, in bytes. */
+ size_t sz;
+
+ /*
+ * Memory buffer -- either used for extracting attr values while
+ * walking the attributes; or for computing attr block bitmaps when
+ * checking the attribute tree.
+ *
+ * Each bitmap contains enough bits to track every byte in an attr
+ * block (rounded up to the size of an unsigned long). The attr block
+ * used space bitmap starts at the beginning of the buffer; the free
+ * space bitmap follows immediately after; and we have a third buffer
+ * for storing intermediate bitmap results.
+ */
+ uint8_t buf[0];
+};
+
+/* A place to store attribute values. */
+static inline uint8_t *
+xchk_xattr_valuebuf(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ return ab->buf;
+}
+
+/* A bitmap of space usage computed by walking an attr leaf block. */
+static inline unsigned long *
+xchk_xattr_usedmap(
+ struct xfs_scrub *sc)
+{
+ struct xchk_xattr_buf *ab = sc->buf;
+
+ return (unsigned long *)ab->buf;
+}
+
+/* A bitmap of free space computed by walking attr leaf block free info. */
+static inline unsigned long *
+xchk_xattr_freemap(
+ struct xfs_scrub *sc)
+{
+ return xchk_xattr_usedmap(sc) +
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+/* A bitmap used to hold temporary results. */
+static inline unsigned long *
+xchk_xattr_dstmap(
+ struct xfs_scrub *sc)
+{
+ return xchk_xattr_freemap(sc) +
+ BITS_TO_LONGS(sc->mp->m_attr_geo->blksize);
+}
+
+int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size,
+ xfs_km_flags_t flags);
+
+#endif /* __XFS_SCRUB_ATTR_H__ */
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index fdadc9e1dc49..3d47d111be5a 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -10,11 +10,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "scrub/xfs_scrub.h"
-#include "scrub/scrub.h"
-#include "scrub/common.h"
-#include "scrub/trace.h"
-#include "scrub/repair.h"
#include "scrub/bitmap.h"
/*
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index e1d11f3223e3..1bd29fdc2ab5 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -9,27 +9,19 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/* Set us up with an inode's bmap. */
int
@@ -281,6 +273,31 @@ xchk_bmap_extent_xref(
xchk_ag_free(info->sc, &info->sc->sa);
}
+/*
+ * Every block mapped by a directory or by an attr fork must be addressable
+ * by a xfs_dablk_t.  Flag the first and/or last file offset of the extent
+ * as corrupt if either one fails that test.
+ */
+STATIC void
+xchk_bmap_dirattr_extent(
+	struct xfs_inode	*ip,
+	struct xchk_bmap_info	*info,
+	struct xfs_bmbt_irec	*irec)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		first = irec->br_startoff;
+	xfs_fileoff_t		last = irec->br_startoff +
+				       irec->br_blockcount - 1;
+	bool			is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+	bool			is_attr = (info->whichfork == XFS_ATTR_FORK);
+
+	/* Only directories and attr forks are subject to this check. */
+	if (!(is_dir || is_attr))
+		return;
+
+	/* Start of the extent. */
+	if (!xfs_verify_dablk(mp, first))
+		xchk_fblock_set_corrupt(info->sc, info->whichfork, first);
+
+	/* Last block covered by the extent. */
+	if (!xfs_verify_dablk(mp, last))
+		xchk_fblock_set_corrupt(info->sc, info->whichfork, last);
+}
+
+
/* Scrub a single extent record. */
STATIC int
xchk_bmap_extent(
@@ -305,6 +322,8 @@ xchk_bmap_extent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+ xchk_bmap_dirattr_extent(ip, info, irec);
+
/* There should never be a "hole" extent in either extent list. */
if (irec->br_startblock == HOLESTARTBLOCK)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 6f94d1f7322d..f52a7b8256f9 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -9,14 +9,7 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -415,8 +408,17 @@ xchk_btree_check_owner(
struct xfs_btree_cur *cur = bs->cur;
struct check_owner *co;
- if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL)
+ /*
+ * In theory, xfs_btree_get_block should only give us a null buffer
+ * pointer for the root of a root-in-inode btree type, but we need
+ * to check defensively here in case the cursor state is also screwed
+ * up.
+ */
+ if (bp == NULL) {
+ if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+ xchk_btree_set_corrupt(bs->sc, bs->cur, level);
return 0;
+ }
/*
* We want to cross-reference each btree block with the bnobt
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 0c54ff55b901..18876056e5e0 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -9,22 +9,16 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
-#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
-#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
@@ -32,12 +26,11 @@
#include "xfs_trans_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
-#include "scrub/btree.h"
#include "scrub/repair.h"
+#include "scrub/health.h"
/* Common code for the metadata scrubbers. */
@@ -208,6 +201,15 @@ xchk_ino_set_preen(
trace_xchk_ino_preen(sc, ino, __return_address);
}
+/*
+ * Record something being wrong with the filesystem primary superblock.
+ *
+ * Sets XFS_SCRUB_OFLAG_CORRUPT in the scrub request's output flags and emits
+ * the xchk_fs_error tracepoint, passing the caller's return address so the
+ * trace shows where the problem was detected.  No block or inode location is
+ * recorded.
+ */
+void
+xchk_set_corrupt(
+ struct xfs_scrub *sc)
+{
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ trace_xchk_fs_error(sc, 0, __return_address);
+}
+
/* Record a corrupt block. */
void
xchk_block_set_corrupt(
@@ -458,13 +460,18 @@ xchk_ag_btcur_init(
struct xfs_mount *mp = sc->mp;
xfs_agnumber_t agno = sa->agno;
- if (sa->agf_bp) {
+ xchk_perag_get(sc->mp, sa);
+ if (sa->agf_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
/* Set up a bnobt cursor for cross-referencing. */
sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
agno, XFS_BTNUM_BNO);
if (!sa->bno_cur)
goto err;
+ }
+ if (sa->agf_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
/* Set up a cntbt cursor for cross-referencing. */
sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
agno, XFS_BTNUM_CNT);
@@ -473,7 +480,8 @@ xchk_ag_btcur_init(
}
/* Set up a inobt cursor for cross-referencing. */
- if (sa->agi_bp) {
+ if (sa->agi_bp &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
agno, XFS_BTNUM_INO);
if (!sa->ino_cur)
@@ -481,7 +489,8 @@ xchk_ag_btcur_init(
}
/* Set up a finobt cursor for cross-referencing. */
- if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp,
agno, XFS_BTNUM_FINO);
if (!sa->fino_cur)
@@ -489,7 +498,8 @@ xchk_ag_btcur_init(
}
/* Set up a rmapbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
agno);
if (!sa->rmap_cur)
@@ -497,7 +507,8 @@ xchk_ag_btcur_init(
}
/* Set up a refcountbt cursor for cross-referencing. */
- if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) &&
+ xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
sa->agf_bp, agno);
if (!sa->refc_cur)
@@ -884,3 +895,21 @@ xchk_ilock_inverted(
}
return -EDEADLOCK;
}
+
+/*
+ * Pause background reaping of resources.
+ *
+ * XCHK_REAPING_DISABLED is set in the scrub context before the reaper is
+ * stopped.  NOTE(review): presumably the flag lets the scrub teardown path
+ * know that reaping must be restarted -- confirm against the teardown code.
+ */
+void
+xchk_stop_reaping(
+ struct xfs_scrub *sc)
+{
+ sc->flags |= XCHK_REAPING_DISABLED;
+ xfs_stop_block_reaping(sc->mp);
+}
+
+/*
+ * Restart background reaping of resources.
+ *
+ * The reaper is restarted first and only then is XCHK_REAPING_DISABLED
+ * cleared from the scrub context, i.e. the reverse of the order used when
+ * stopping.
+ */
+void
+xchk_start_reaping(
+ struct xfs_scrub *sc)
+{
+ xfs_start_block_reaping(sc->mp);
+ sc->flags &= ~XCHK_REAPING_DISABLED;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index e26a430bd466..003a772cd26c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -39,6 +39,7 @@ void xchk_block_set_preen(struct xfs_scrub *sc,
struct xfs_buf *bp);
void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino);
+void xchk_set_corrupt(struct xfs_scrub *sc);
void xchk_block_set_corrupt(struct xfs_scrub *sc,
struct xfs_buf *bp);
void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
@@ -105,6 +106,7 @@ xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip)
return -ENOENT;
}
#endif
+int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip);
void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -137,5 +139,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+void xchk_stop_reaping(struct xfs_scrub *sc);
+void xchk_start_reaping(struct xfs_scrub *sc);
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index f1260b4bfdee..94c4f1de1922 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -9,20 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_attr_leaf.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -574,6 +566,11 @@ xchk_da_btree(
/* Drill another level deeper. */
blkno = be32_to_cpu(key->before);
level++;
+ if (level >= XFS_DA_NODE_MAXDEPTH) {
+ /* Too deep! */
+ xchk_da_set_corrupt(&ds, level - 1);
+ break;
+ }
ds.tree_level--;
error = xchk_da_btree_block(&ds, level, blkno);
if (error)
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index cd3e4d768a18..1e2e11721eb9 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -9,24 +9,14 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
-#include "xfs_itable.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
#include "scrub/dabtree.h"
/* Set us up to scrub directories. */
@@ -129,6 +119,12 @@ xchk_dir_actor(
goto out;
}
+ /* Does this name make sense? */
+ if (!xfs_dir2_namecheck(name, namelen)) {
+ xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+ goto out;
+ }
+
if (!strncmp(".", name, namelen)) {
/* If this is "." then check that the inum matches the dir. */
if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
new file mode 100644
index 000000000000..fc3f510c9034
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * The basics of filesystem summary counter checking are that we iterate the
+ * AGs counting the number of free blocks, free space btree blocks, per-AG
+ * reservations, inodes, delayed allocation reservations, and free inodes.
+ * Then we compare what we computed against the in-core counters.
+ *
+ * However, the reality is that summary counters are a tricky beast to check.
+ * While we /could/ freeze the filesystem and scramble around the AGs counting
+ * the free blocks, in practice we prefer not to do that for a scan because
+ * freezing is costly. To get around this, we added a per-cpu counter of the
+ * delalloc reservations so that we can rotor around the AGs relatively
+ * quickly, and we allow the counts to be slightly off because we're not taking
+ * any locks while we do this.
+ *
+ * So the first thing we do is warm up the buffer cache in the setup routine by
+ * walking all the AGs to make sure the incore per-AG structure has been
+ * initialized. The expected value calculation then iterates the incore per-AG
+ * structures as quickly as it can. We snapshot the percpu counters before and
+ * after this operation and use the difference in counter values to guess at
+ * our tolerance for mismatch between expected and actual counter values.
+ */
+
+/*
+ * Since the expected value computation is lockless but only browses incore
+ * values, the percpu counters should be fairly close to each other. However,
+ * we'll allow ourselves to be off by at least this (arbitrary) amount.
+ *
+ * The same threshold is applied to the inode counters and to the free block
+ * counter, so its unit is whatever the counter being checked counts.
+ */
+#define XCHK_FSCOUNT_MIN_VARIANCE (512)
+
+/*
+ * Make sure the per-AG structure has been initialized from the on-disk header
+ * contents and trust that the incore counters match the ondisk counters.  (The
+ * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
+ * summary counters after checking all AG headers).  Do this from the setup
+ * function so that the inner AG aggregation loop runs as quickly as possible.
+ *
+ * This function runs during the setup phase /before/ we start checking any
+ * metadata.
+ *
+ * Returns 0 once every AG has been visited; a negative errno if reading an AG
+ * header fails, if a header buffer comes back NULL (-ENOMEM), or if the
+ * header read did not initialize the perag (-EFSCORRUPTED); or the error
+ * reported by xchk_should_terminate() if a fatal signal cuts the walk short.
+ */
+STATIC int
+xchk_fscount_warmup(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*agi_bp = NULL;
+	struct xfs_buf		*agf_bp = NULL;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno;
+	int			error = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+
+		/* Nothing to do if both halves are already initialized. */
+		if (pag->pagi_init && pag->pagf_init)
+			goto next_loop_perag;
+
+		/* Lock both AG headers. */
+		error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
+		if (error)
+			break;
+		error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
+		if (error)
+			break;
+		error = -ENOMEM;
+		if (!agf_bp || !agi_bp)
+			break;
+
+		/*
+		 * These are supposed to be initialized by the header read
+		 * function.
+		 */
+		error = -EFSCORRUPTED;
+		if (!pag->pagi_init || !pag->pagf_init)
+			break;
+
+		xfs_buf_relse(agf_bp);
+		agf_bp = NULL;
+		xfs_buf_relse(agi_bp);
+		agi_bp = NULL;
+next_loop_perag:
+		xfs_perag_put(pag);
+		pag = NULL;
+		error = 0;
+
+		/*
+		 * If we're being killed, stop now and let the caller know.
+		 * Returning success here would claim that every AG had been
+		 * warmed up when the remaining ones were never looked at.
+		 * xchk_should_terminate() is expected to store the errno in
+		 * @error when it returns true.
+		 */
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+
+	/* Drop whatever we were still holding when we left the loop. */
+	if (agf_bp)
+		xfs_buf_relse(agf_bp);
+	if (agi_bp)
+		xfs_buf_relse(agi_bp);
+	if (pag)
+		xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Prepare to check the summary counters: allocate the scratch structure and
+ * attach it to the scrub context, record the permissible inode count range,
+ * warm up the incore per-AG counters, pause background reaping, and finally
+ * allocate the scrub transaction.
+ */
+int
+xchk_setup_fscounters(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*ip)
+{
+	struct xchk_fscounters	*counters;
+	int			ret;
+
+	counters = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP);
+	sc->buf = counters;
+	if (counters == NULL)
+		return -ENOMEM;
+
+	xfs_icount_range(sc->mp, &counters->icount_min,
+			&counters->icount_max);
+
+	/* The incore counters have to be set up before we can go on. */
+	ret = xchk_fscount_warmup(sc);
+	if (ret != 0)
+		return ret;
+
+	/*
+	 * Background reclaim can perturb the counters while we're summing
+	 * them, so pause it for the duration of the scrub to keep our
+	 * calculations from being thrown off.
+	 */
+	xchk_stop_reaping(sc);
+
+	return xchk_trans_alloc(sc, 0);
+}
+
+/*
+ * Calculate what the global in-core counters ought to be from the incore
+ * per-AG structure.  Callers can compare this to the actual in-core counters
+ * to estimate by how much both in-core and on-disk counters need to be
+ * adjusted.
+ *
+ * On success the totals are left in @fsc->icount, @fsc->ifree and
+ * @fsc->fdblocks and 0 is returned.  0 is also returned, with the scrub
+ * marked incomplete, if ifree kept exceeding icount after several attempts.
+ * Returns -EFSCORRUPTED if a perag is found uninitialized or if the totals
+ * are nonsensical, and the error reported by xchk_should_terminate() if a
+ * fatal signal arrives while walking the AGs.
+ */
+STATIC int
+xchk_fscount_aggregate_agcounts(
+	struct xfs_scrub	*sc,
+	struct xchk_fscounters	*fsc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_perag	*pag;
+	uint64_t		delayed;
+	xfs_agnumber_t		agno;
+	int			tries = 8;
+	int			error = 0;
+
+retry:
+	fsc->icount = 0;
+	fsc->ifree = 0;
+	fsc->fdblocks = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		/*
+		 * If we're being killed, give up before touching the sums.
+		 * Carrying on with totals from only some of the AGs would
+		 * make the sanity checks below report corruption that isn't
+		 * there.  xchk_should_terminate() is expected to store the
+		 * errno in @error when it returns true.
+		 */
+		if (xchk_should_terminate(sc, &error))
+			return error;
+
+		pag = xfs_perag_get(mp, agno);
+
+		/* This somehow got unset since the warmup? */
+		if (!pag->pagi_init || !pag->pagf_init) {
+			xfs_perag_put(pag);
+			return -EFSCORRUPTED;
+		}
+
+		/* Count all the inodes */
+		fsc->icount += pag->pagi_count;
+		fsc->ifree += pag->pagi_freecount;
+
+		/* Add up the free/freelist/bnobt/cntbt blocks */
+		fsc->fdblocks += pag->pagf_freeblks;
+		fsc->fdblocks += pag->pagf_flcount;
+		fsc->fdblocks += pag->pagf_btreeblks;
+
+		/*
+		 * Per-AG reservations are taken out of the incore counters,
+		 * so they must be left out of the free blocks computation.
+		 */
+		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
+		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;
+
+		xfs_perag_put(pag);
+	}
+
+	/*
+	 * The global incore space reservation is taken from the incore
+	 * counters, so leave that out of the computation.
+	 */
+	fsc->fdblocks -= mp->m_resblks_avail;
+
+	/*
+	 * Delayed allocation reservations are taken out of the incore counters
+	 * but not recorded on disk, so leave them and their indlen blocks out
+	 * of the computation.
+	 */
+	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
+	fsc->fdblocks -= delayed;
+
+	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
+			delayed);
+
+	/* Bail out if the values we compute are totally nonsense. */
+	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
+	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
+	    fsc->ifree > fsc->icount_max)
+		return -EFSCORRUPTED;
+
+	/*
+	 * If ifree > icount then we probably had some perturbation in the
+	 * counters while we were calculating things.  We'll try a few times
+	 * to maintain ifree <= icount before giving up.
+	 */
+	if (fsc->ifree > fsc->icount) {
+		if (tries--)
+			goto retry;
+		xchk_set_incomplete(sc);
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Is the @counter reasonably close to the @expected value?
+ *
+ * We neither locked nor froze anything in the filesystem while aggregating the
+ * per-AG data to compute the @expected value, which means that the counter
+ * could have changed. We know the @old_value of the summation of the counter
+ * before the aggregation, and we re-sum the counter now. If the expected
+ * value falls between the two summations, we're ok.
+ *
+ * Otherwise, we /might/ have a problem. If the change in the summations is
+ * more than we want to tolerate, the filesystem is probably busy and we should
+ * just send back INCOMPLETE and see if userspace will try again.
+ *
+ * Returns true if the counter is acceptable, or if the scrub was marked
+ * incomplete because the counter moved by XCHK_FSCOUNT_MIN_VARIANCE or more
+ * between the two summations. Returns false if the caller should treat the
+ * counter as corrupt.
+ */
+static inline bool
+xchk_fscount_within_range(
+ struct xfs_scrub *sc,
+ const int64_t old_value,
+ struct percpu_counter *counter,
+ uint64_t expected)
+{
+ int64_t min_value, max_value;
+ /* Re-sum the counter now; @old_value is the sum from before. */
+ int64_t curr_value = percpu_counter_sum(counter);
+
+ trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
+ old_value);
+
+ /* Negative values are always wrong. */
+ if (curr_value < 0)
+ return false;
+
+ /* Exact matches are always ok. */
+ if (curr_value == expected)
+ return true;
+
+ min_value = min(old_value, curr_value);
+ max_value = max(old_value, curr_value);
+
+ /*
+ * Within the before-and-after range is ok.
+ *
+ * NOTE(review): @expected is unsigned while min_value and max_value are
+ * signed, so these comparisons are done as unsigned. curr_value is known
+ * to be non-negative here, but a negative @old_value would compare as a
+ * very large number -- confirm that callers never rely on this branch in
+ * that case.
+ */
+ if (expected >= min_value && expected <= max_value)
+ return true;
+
+ /*
+ * If the difference between the two summations is too large, the fs
+ * might just be busy and so we'll mark the scrub incomplete. Return
+ * true here so that we don't mark the counter corrupt.
+ *
+ * XXX: In the future when userspace can grant scrub permission to
+ * quiesce the filesystem to solve the outsized variance problem, this
+ * check should be moved up and the return code changed to signal to
+ * userspace that we need quiesce permission.
+ */
+ if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
+ xchk_set_incomplete(sc);
+ return true;
+ }
+
+ /* Outside the observed range and the fs wasn't busy: not ok. */
+ return false;
+}
+
+/*
+ * Check the superblock counters.
+ *
+ * Snapshot the percpu icount/ifree/fdblocks counters, run some standalone
+ * sanity checks on the snapshot, compute the expected values from the incore
+ * per-AG data, and then check that each expected value is consistent with
+ * the counter as it was before and as it is after that computation.
+ * Problems are recorded with xchk_set_corrupt().
+ *
+ * Returns 0 unless xchk_process_error() tells us to stop, in which case the
+ * error it left in @error is returned.
+ */
+int
+xchk_fscounters(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ /* Scratch structure attached to sc->buf; the aggregation fills it in. */
+ struct xchk_fscounters *fsc = sc->buf;
+ int64_t icount, ifree, fdblocks;
+ int error;
+
+ /* Snapshot the percpu counters. */
+ icount = percpu_counter_sum(&mp->m_icount);
+ ifree = percpu_counter_sum(&mp->m_ifree);
+ fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+
+ /* No negative values, please! */
+ if (icount < 0 || ifree < 0 || fdblocks < 0)
+ xchk_set_corrupt(sc);
+
+ /* See if icount is obviously wrong. */
+ if (icount < fsc->icount_min || icount > fsc->icount_max)
+ xchk_set_corrupt(sc);
+
+ /* See if fdblocks is obviously wrong. */
+ if (fdblocks > mp->m_sb.sb_dblocks)
+ xchk_set_corrupt(sc);
+
+ /*
+ * If ifree exceeds icount by more than the minimum variance then
+ * something's probably wrong with the counters.
+ */
+ if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
+ xchk_set_corrupt(sc);
+
+ /* Walk the incore AG headers to calculate the expected counters. */
+ error = xchk_fscount_aggregate_agcounts(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
+ /* The aggregation gave up; there is nothing meaningful to compare. */
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
+ /* Compare the in-core counters with whatever we counted. */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
+ xchk_set_corrupt(sc);
+
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
+ xchk_set_corrupt(sc);
+
+ if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
+ fsc->fdblocks))
+ xchk_set_corrupt(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
new file mode 100644
index 000000000000..b2f602811e9d
--- /dev/null
+++ b/fs/xfs/scrub/health.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_btree.h"
+#include "xfs_sb.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+
+/*
+ * Scrub and In-Core Filesystem Health Assessments
+ * ===============================================
+ *
+ * Online scrub and repair have the time and the ability to perform stronger
+ * checks than we can do from the metadata verifiers, because they can
+ * cross-reference records between data structures. Therefore, scrub is in a
+ * good position to update the online filesystem health assessments to reflect
+ * the good/bad state of the data structure.
+ *
+ * We therefore extend scrub in the following ways to achieve this:
+ *
+ * 1. Create a "sick_mask" field in the scrub context. When we're setting up a
+ * scrub call, set this to the default XFS_SICK_* flag(s) for the selected
+ * scrub type (call it A). Scrub and repair functions can override the default
+ * sick_mask value if they choose.
+ *
+ * 2. If the scrubber returns a runtime error code, we exit making no changes
+ * to the incore sick state.
+ *
+ * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore
+ * sick flags before exiting.
+ *
+ * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore
+ * sick flags. If the user didn't want to repair then we exit, leaving the
+ * metadata structure unfixed and the sick flag set.
+ *
+ * 5. Now we know that A is corrupt and the user wants to repair, so run the
+ * repairer. If the repairer returns an error code, we exit with that error
+ * code, having made no further changes to the incore sick state.
+ *
+ * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean,
+ * use sick_mask to clear the incore sick flags. This should have the effect
+ * that A is no longer marked sick.
+ *
+ * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and
+ * use sick_mask to set the incore sick flags. This should have no externally
+ * visible effect since we already set them in step (4).
+ *
+ * There are some complications to this story, however. For certain types of
+ * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild
+ * both structures at the same time. The following principles apply to this
+ * type of repair strategy:
+ *
+ * 8. Any repair function that rebuilds multiple structures should update
+ * sick_mask to reflect whatever other structures are rebuilt, and
+ * verify that all the rebuilt structures can pass a scrub check. The outcomes
+ * of 5-7 still apply, but with a sick_mask that covers everything being
+ * rebuilt.
+ */
+
+/* Map our scrub type to a sick mask and a set of health update functions. */
+
+/*
+ * Which kind of object a scrub type's health status is recorded against.
+ * The values start at 1, so a zero-initialized map entry names no group
+ * (presumably deliberate, so that unmapped scrub types are detectable).
+ */
+enum xchk_health_group {
+ /* Whole-filesystem state, updated via xfs_fs_mark_{sick,healthy}. */
+ XHG_FS = 1,
+ /* Realtime state, updated via xfs_rt_mark_{sick,healthy}. */
+ XHG_RT,
+ /* Per-AG state, updated via xfs_ag_mark_{sick,healthy}. */
+ XHG_AG,
+ /* Per-inode state, updated via xfs_inode_mark_{sick,healthy}. */
+ XHG_INO,
+};
+
+/* One entry of the scrub type -> health status mapping table. */
+struct xchk_health_map {
+ /* Which set of mark-sick/mark-healthy functions applies. */
+ enum xchk_health_group group;
+ /* XFS_SICK_* flag(s) associated with the scrub type. */
+ unsigned int sick_mask;
+};
+
+/*
+ * Health group and sick mask for each scrub type, indexed by
+ * XFS_SCRUB_TYPE_*. Any scrub type not listed here gets a zero-filled
+ * entry, i.e. no group and an empty sick mask.
+ */
+static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB },
+ [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF },
+ [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL },
+ [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI },
+ [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT },
+ [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT },
+ [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT },
+ [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT },
+ [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT },
+ [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT },
+ [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE },
+ [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD },
+ [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA },
+ [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC },
+ [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR },
+ [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR },
+ [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK },
+ [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT },
+ [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP },
+ [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY },
+ [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA },
+ [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA },
+ [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA },
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS },
+};
+
+/*
+ * Look up the XFS_SICK_* mask that the mapping table associates with the
+ * given scrub type.  @scrub_type is used directly as the table index.
+ */
+unsigned int
+xchk_health_mask_for_scrub_type(
+	__u32			scrub_type)
+{
+	const struct xchk_health_map	*map = &type_to_health_flag[scrub_type];
+
+	return map->sick_mask;
+}
+
+/*
+ * Fold the outcome of this scrub into the filesystem health assessments.
+ *
+ * When the scrub flagged corruption, everything named in sick_mask is marked
+ * sick, regardless of whether this was a first scan or a post-repair
+ * evaluation.  When it found no direct corruption, everything named in
+ * sick_mask is marked healthy instead.  An empty sick_mask means there is
+ * nothing to update.
+ */
+void
+xchk_update_health(
+	struct xfs_scrub		*sc)
+{
+	const struct xchk_health_map	*map;
+	struct xfs_perag		*pag;
+	bool				corrupt;
+
+	if (sc->sick_mask == 0)
+		return;
+
+	corrupt = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) != 0;
+	map = &type_to_health_flag[sc->sm->sm_type];
+
+	switch (map->group) {
+	case XHG_AG:
+		/* Per-AG state hangs off the perag for the scrubbed AG. */
+		pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
+		if (!corrupt)
+			xfs_ag_mark_healthy(pag, sc->sick_mask);
+		else
+			xfs_ag_mark_sick(pag, sc->sick_mask);
+		xfs_perag_put(pag);
+		break;
+	case XHG_INO:
+		/* No inode attached to the scrub, nothing to mark. */
+		if (sc->ip == NULL)
+			break;
+		if (!corrupt)
+			xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
+		else
+			xfs_inode_mark_sick(sc->ip, sc->sick_mask);
+		break;
+	case XHG_FS:
+		if (!corrupt)
+			xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
+		else
+			xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+		break;
+	case XHG_RT:
+		if (!corrupt)
+			xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
+		else
+			xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
+/*
+ * Decide whether it is worth setting up a cursor for the given per-AG btree.
+ *
+ * The btree that is itself the target of this scrub always qualifies, even
+ * if it is already known to be corrupt.  Any other btree is only wanted for
+ * cross-referencing, so if the AG already records it as sick we set XFAIL in
+ * the scrub output flags and report that it is not usable.
+ */
+bool
+xchk_ag_btree_healthy_enough(
+	struct xfs_scrub	*sc,
+	struct xfs_perag	*pag,
+	xfs_btnum_t		btnum)
+{
+	unsigned int		own_type;
+	unsigned int		sick;
+
+	/* Map the btree to its scrub type and its per-AG sick flag. */
+	switch (btnum) {
+	case XFS_BTNUM_BNO:
+		own_type = XFS_SCRUB_TYPE_BNOBT;
+		sick = XFS_SICK_AG_BNOBT;
+		break;
+	case XFS_BTNUM_CNT:
+		own_type = XFS_SCRUB_TYPE_CNTBT;
+		sick = XFS_SICK_AG_CNTBT;
+		break;
+	case XFS_BTNUM_INO:
+		own_type = XFS_SCRUB_TYPE_INOBT;
+		sick = XFS_SICK_AG_INOBT;
+		break;
+	case XFS_BTNUM_FINO:
+		own_type = XFS_SCRUB_TYPE_FINOBT;
+		sick = XFS_SICK_AG_FINOBT;
+		break;
+	case XFS_BTNUM_RMAP:
+		own_type = XFS_SCRUB_TYPE_RMAPBT;
+		sick = XFS_SICK_AG_RMAPBT;
+		break;
+	case XFS_BTNUM_REFC:
+		own_type = XFS_SCRUB_TYPE_REFCNTBT;
+		sick = XFS_SICK_AG_REFCNTBT;
+		break;
+	default:
+		ASSERT(0);
+		return true;
+	}
+
+	/* We always want the cursor for the structure being scrubbed. */
+	if (sc->sm->sm_type == own_type)
+		return true;
+
+	/* Not known to be bad, so it's fine for cross-referencing. */
+	if (!xfs_ag_has_sickness(pag, sick))
+		return true;
+
+	/* Known bad: skip the cross-reference and say so. */
+	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
+	return false;
+}
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
new file mode 100644
index 000000000000..d0b938d3d028
--- /dev/null
+++ b/fs/xfs/scrub/health.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_HEALTH_H__
+#define __XFS_SCRUB_HEALTH_H__
+
+unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
+void xchk_update_health(struct xfs_scrub *sc);
+bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
+ xfs_btnum_t btnum);
+
+#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 882dc56c5c21..681758704fda 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -9,21 +9,14 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
-#include "xfs_log.h"
-#include "xfs_trans_priv.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
@@ -39,7 +32,7 @@ xchk_setup_ag_iallocbt(
struct xfs_scrub *sc,
struct xfs_inode *ip)
{
- return xchk_setup_ag_btree(sc, ip, sc->try_harder);
+ return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER);
}
/* Inode btree scrubber. */
@@ -47,6 +40,12 @@ xchk_setup_ag_iallocbt(
struct xchk_iallocbt {
/* Number of inodes we see while scanning inobt. */
unsigned long long inodes;
+
+ /* Expected next startino, for big block filesystems. */
+ xfs_agino_t next_startino;
+
+ /* Expected end of the current inode cluster. */
+ xfs_agino_t next_cluster_ino;
};
/*
@@ -128,42 +127,58 @@ xchk_iallocbt_freecount(
return hweight64(freemask);
}
-/* Check a particular inode with ir_free. */
+/*
+ * Check that an inode's allocation status matches ir_free in the inobt
+ * record. First we try querying the in-core inode state, and if the inode
+ * isn't loaded we examine the on-disk inode directly.
+ *
+ * Since there can be 1:M and M:1 mappings between inobt records and inode
+ * clusters, we pass in the inode location information as an inobt record;
+ * the index of an inode cluster within the inobt record (as well as the
+ * cluster buffer itself); and the index of the inode within the cluster.
+ *
+ * @irec is the inobt record.
+ * @irec_ino is the inode offset from the start of the record.
+ * @dip is the on-disk inode.
+ */
STATIC int
-xchk_iallocbt_check_cluster_freemask(
+xchk_iallocbt_check_cluster_ifree(
struct xchk_btree *bs,
- xfs_ino_t fsino,
- xfs_agino_t chunkino,
- xfs_agino_t clusterino,
struct xfs_inobt_rec_incore *irec,
- struct xfs_buf *bp)
+ unsigned int irec_ino,
+ struct xfs_dinode *dip)
{
- struct xfs_dinode *dip;
struct xfs_mount *mp = bs->cur->bc_mp;
- bool inode_is_free = false;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ bool irec_free;
+ bool ino_inuse;
bool freemask_ok;
- bool inuse;
int error = 0;
if (xchk_should_terminate(bs->sc, &error))
return error;
- dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
+ /*
+ * Given an inobt record and the offset of an inode from the start of
+ * the record, compute which fs inode we're talking about.
+ */
+ agino = irec->ir_startino + irec_ino;
+ fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
+ irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino));
+
if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
- (dip->di_version >= 3 &&
- be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
+ (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
goto out;
}
- if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino))
- inode_is_free = true;
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
- fsino + clusterino, &inuse);
+ error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
+ &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
- freemask_ok = inode_is_free ^ !!(dip->di_mode);
- if (!bs->sc->try_harder && !freemask_ok)
+ freemask_ok = irec_free ^ !!(dip->di_mode);
+ if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok)
return -EDEADLOCK;
} else if (error < 0) {
/*
@@ -174,7 +189,7 @@ xchk_iallocbt_check_cluster_freemask(
goto out;
} else {
/* Inode is all there. */
- freemask_ok = inode_is_free ^ inuse;
+ freemask_ok = irec_free ^ ino_inuse;
}
if (!freemask_ok)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
@@ -182,86 +197,223 @@ out:
return 0;
}
-/* Make sure the free mask is consistent with what the inodes think. */
+/*
+ * Check that the holemask and freemask of a hypothetical inode cluster match
+ * what's actually on disk. If sparse inodes are enabled, the cluster does
+ * not actually have to map to inodes if the corresponding holemask bit is set.
+ *
+ * @cluster_base is the first inode in the cluster within the @irec.
+ */
STATIC int
-xchk_iallocbt_check_freemask(
+xchk_iallocbt_check_cluster(
struct xchk_btree *bs,
- struct xfs_inobt_rec_incore *irec)
+ struct xfs_inobt_rec_incore *irec,
+ unsigned int cluster_base)
{
struct xfs_imap imap;
struct xfs_mount *mp = bs->cur->bc_mp;
struct xfs_dinode *dip;
- struct xfs_buf *bp;
- xfs_ino_t fsino;
- xfs_agino_t nr_inodes;
- xfs_agino_t agino;
- xfs_agino_t chunkino;
- xfs_agino_t clusterino;
+ struct xfs_buf *cluster_bp;
+ unsigned int nr_inodes;
+ xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agblock_t agbno;
- uint16_t holemask;
+ unsigned int cluster_index;
+ uint16_t cluster_mask = 0;
uint16_t ir_holemask;
int error = 0;
- /* Make sure the freemask matches the inode records. */
- nr_inodes = mp->m_inodes_per_cluster;
-
- for (agino = irec->ir_startino;
- agino < irec->ir_startino + XFS_INODES_PER_CHUNK;
- agino += mp->m_inodes_per_cluster) {
- fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino);
- chunkino = agino - irec->ir_startino;
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
-
- /* Compute the holemask mask for this cluster. */
- for (clusterino = 0, holemask = 0; clusterino < nr_inodes;
- clusterino += XFS_INODES_PER_HOLEMASK_BIT)
- holemask |= XFS_INOBT_MASK((chunkino + clusterino) /
- XFS_INODES_PER_HOLEMASK_BIT);
-
- /* The whole cluster must be a hole or not a hole. */
- ir_holemask = (irec->ir_holemask & holemask);
- if (ir_holemask != holemask && ir_holemask != 0) {
+ nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ M_IGEO(mp)->inodes_per_cluster);
+
+ /* Map this inode cluster */
+ agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base);
+
+ /* Compute a bitmask for this cluster that can be used for holemask. */
+ for (cluster_index = 0;
+ cluster_index < nr_inodes;
+ cluster_index += XFS_INODES_PER_HOLEMASK_BIT)
+ cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) /
+ XFS_INODES_PER_HOLEMASK_BIT);
+
+ /*
+ * Map the first inode of this cluster to a buffer and offset.
+ * Be careful about inobt records that don't align with the start of
+ * the inode buffer when block sizes are large enough to hold multiple
+ * inode chunks. When this happens, cluster_base will be zero but
+ * ir_startino can be large enough to make im_boffset nonzero.
+ */
+ ir_holemask = (irec->ir_holemask & cluster_mask);
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
+ imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) <<
+ mp->m_sb.sb_inodelog;
+
+ if (imap.im_boffset != 0 && cluster_base != 0) {
+ ASSERT(imap.im_boffset == 0 || cluster_base == 0);
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino,
+ imap.im_blkno, imap.im_len, cluster_base, nr_inodes,
+ cluster_mask, ir_holemask,
+ XFS_INO_TO_OFFSET(mp, irec->ir_startino +
+ cluster_base));
+
+ /* The whole cluster must be a hole or not a hole. */
+ if (ir_holemask != cluster_mask && ir_holemask != 0) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return 0;
+ }
+
+ /* If any part of this is a hole, skip it. */
+ if (ir_holemask) {
+ xchk_xref_is_not_owned_by(bs->sc, agbno,
+ M_IGEO(mp)->blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+ return 0;
+ }
+
+ xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster,
+ &XFS_RMAP_OINFO_INODES);
+
+ /* Grab the inode cluster buffer. */
+ error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp,
+ 0, 0);
+ if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error))
+ return error;
+
+ /* Check free status of each inode within this cluster. */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ struct xfs_dinode *dip;
+
+ if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
- continue;
+ break;
}
- /* If any part of this is a hole, skip it. */
- if (ir_holemask) {
- xchk_xref_is_not_owned_by(bs->sc, agbno,
- mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
- continue;
+ dip = xfs_buf_offset(cluster_bp, imap.im_boffset);
+ error = xchk_iallocbt_check_cluster_ifree(bs, irec,
+ cluster_base + cluster_index, dip);
+ if (error)
+ break;
+ imap.im_boffset += mp->m_sb.sb_inodesize;
+ }
+
+ xfs_trans_brelse(bs->cur->bc_tp, cluster_bp);
+ return error;
+}
+
+/*
+ * For all the inode clusters that could map to this inobt record, make sure
+ * that the holemask makes sense and that the allocation status of each inode
+ * matches the freemask.
+ */
+STATIC int
+xchk_iallocbt_check_clusters(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ unsigned int cluster_base;
+ int error = 0;
+
+ /*
+ * For the common case where this inobt record maps to multiple inode
+ * clusters this will call _check_cluster for each cluster.
+ *
+ * For the case that multiple inobt records map to a single cluster,
+ * this will call _check_cluster once.
+ */
+ for (cluster_base = 0;
+ cluster_base < XFS_INODES_PER_CHUNK;
+ cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) {
+ error = xchk_iallocbt_check_cluster(bs, irec, cluster_base);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/*
+ * Make sure this inode btree record is aligned properly. Because a fs block
+ * contains multiple inodes, we check that the inobt record is aligned to the
+ * correct inode, not just the correct block on disk. This results in a finer
+ * grained corruption check.
+ */
+STATIC void
+xchk_iallocbt_rec_alignment(
+ struct xchk_btree *bs,
+ struct xfs_inobt_rec_incore *irec)
+{
+ struct xfs_mount *mp = bs->sc->mp;
+ struct xchk_iallocbt *iabt = bs->private;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+
+ /*
+ * finobt records have different positioning requirements than inobt
+ * records: each finobt record must have a corresponding inobt record.
+ * That is checked in the xref function, so for now we only catch the
+ * obvious case where the record isn't at all aligned properly.
+ *
+ * Note that if a fs block contains more than a single chunk of inodes,
+ * we will have finobt records only for those chunks containing free
+ * inodes, and therefore expect chunk alignment of finobt records.
+ * Otherwise, we expect that the finobt record is aligned to the
+ * cluster alignment as told by the superblock.
+ */
+ if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+ unsigned int imask;
+
+ imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
+ igeo->cluster_align_inodes) - 1;
+ if (irec->ir_startino & imask)
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
+
+ if (iabt->next_startino != NULLAGINO) {
+ /*
+ * We're midway through a cluster of inodes that is mapped by
+ * multiple inobt records. Did we get the record for the next
+ * irec in the sequence?
+ */
+ if (irec->ir_startino != iabt->next_startino) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
- &XFS_RMAP_OINFO_INODES);
+ iabt->next_startino += XFS_INODES_PER_CHUNK;
- /* Grab the inode cluster buffer. */
- imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
- agbno);
- imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
- imap.im_boffset = 0;
-
- error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
- &dip, &bp, 0, 0);
- if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
- &error))
- continue;
-
- /* Which inodes are free? */
- for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
- error = xchk_iallocbt_check_cluster_freemask(bs,
- fsino, chunkino, clusterino, irec, bp);
- if (error) {
- xfs_trans_brelse(bs->cur->bc_tp, bp);
- return error;
- }
+ /* Are we done with the cluster? */
+ if (iabt->next_startino >= iabt->next_cluster_ino) {
+ iabt->next_startino = NULLAGINO;
+ iabt->next_cluster_ino = NULLAGINO;
}
+ return;
+ }
+
+ /* inobt records must be aligned to cluster and inoalignmt size. */
+ if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
+ }
- xfs_trans_brelse(bs->cur->bc_tp, bp);
+ if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) {
+ xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ return;
}
- return error;
+ if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+ return;
+
+ /*
+ * If this is the start of an inode cluster that can be mapped by
+ * multiple inobt records, the next inobt record must follow exactly
+ * after this one.
+ */
+ iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+ iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster;
}
/* Scrub an inobt/finobt record. */
@@ -276,7 +428,6 @@ xchk_iallocbt_rec(
uint64_t holes;
xfs_agnumber_t agno = bs->cur->bc_private.a.agno;
xfs_agino_t agino;
- xfs_agblock_t agbno;
xfs_extlen_t len;
int holecount;
int i;
@@ -303,11 +454,9 @@ xchk_iallocbt_rec(
goto out;
}
- /* Make sure this record is aligned to cluster and inoalignmnt size. */
- agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
- if ((agbno & (mp->m_cluster_align - 1)) ||
- (agbno & (mp->m_blocks_per_cluster - 1)))
- xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+ xchk_iallocbt_rec_alignment(bs, &irec);
+ if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
iabt->inodes += irec.ir_count;
@@ -320,7 +469,7 @@ xchk_iallocbt_rec(
if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
goto out;
- goto check_freemask;
+ goto check_clusters;
}
/* Check each chunk of a sparse inode cluster. */
@@ -346,8 +495,8 @@ xchk_iallocbt_rec(
holecount + irec.ir_count != XFS_INODES_PER_CHUNK)
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
-check_freemask:
- error = xchk_iallocbt_check_freemask(bs, &irec);
+check_clusters:
+ error = xchk_iallocbt_check_clusters(bs, &irec);
if (error)
goto out;
@@ -429,6 +578,8 @@ xchk_iallocbt(
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
.inodes = 0,
+ .next_startino = NULLAGINO,
+ .next_cluster_ino = NULLAGINO,
};
int error;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index e213efc194a1..6d483ab29e63 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -9,27 +9,17 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_icache.h"
-#include "xfs_inode_buf.h"
-#include "xfs_inode_fork.h"
#include "xfs_ialloc.h"
#include "xfs_da_format.h"
#include "xfs_reflink.h"
#include "xfs_rmap.h"
-#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Grab total control of the inode metadata. It doesn't matter here if
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 1c9d7c7f64f5..c962bd534690 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -9,21 +9,13 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_ialloc.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Set us up to scrub parents. */
int
@@ -320,7 +312,7 @@ out:
* If we failed to lock the parent inode even after a retry, just mark
* this scrub incomplete and return.
*/
- if (sc->try_harder && error == -EDEADLOCK) {
+ if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
error = 0;
xchk_set_incomplete(sc);
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 782d582d3edd..0a33b4421c32 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -9,24 +9,13 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_dquot.h"
-#include "xfs_dquot_item.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
static inline uint
@@ -60,7 +49,7 @@ xchk_setup_quota(
dqtype = xchk_quota_to_dqtype(sc);
if (dqtype == 0)
return -EINVAL;
- sc->has_quotaofflock = true;
+ sc->flags |= XCHK_HAS_QUOTAOFFLOCK;
mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
@@ -144,7 +133,7 @@ xchk_quota_item(
if (bsoft > bhard)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
- if (ihard > mp->m_maxicount)
+ if (ihard > M_IGEO(mp)->maxicount)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
if (isoft > ihard)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 708b4158eb90..93b3793bc5b3 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,22 +7,12 @@
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Set us up to scrub reference count btrees.
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 6acf1bfa0bfe..4cfeec57fb05 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -9,29 +9,21 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
-#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
-#include "xfs_trans_space.h"
#include "xfs_quota.h"
-#include "xfs_attr.h"
-#include "xfs_reflink.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -46,8 +38,7 @@
int
xrep_attempt(
struct xfs_inode *ip,
- struct xfs_scrub *sc,
- bool *fixed)
+ struct xfs_scrub *sc)
{
int error = 0;
@@ -66,13 +57,13 @@ xrep_attempt(
* scrub so that we can tell userspace if we fixed the problem.
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
- *fixed = true;
+ sc->flags |= XREP_ALREADY_FIXED;
return -EAGAIN;
case -EDEADLOCK:
case -EAGAIN:
/* Tell the caller to try again having grabbed all the locks. */
- if (!sc->try_harder) {
- sc->try_harder = true;
+ if (!(sc->flags & XCHK_TRY_HARDER)) {
+ sc->flags |= XCHK_TRY_HARDER;
return -EAGAIN;
}
/*
@@ -137,10 +128,16 @@ xrep_roll_ag_trans(
if (sc->sa.agfl_bp)
xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
- /* Roll the transaction. */
+ /*
+ * Roll the transaction. We still own the buffer and the buffer lock
+ * regardless of whether or not the roll succeeds. If the roll fails,
+ * the buffers will be released during teardown on our way out of the
+ * kernel. If it succeeds, we join them to the new transaction and
+ * move on.
+ */
error = xfs_trans_roll(&sc->tp);
if (error)
- goto out_release;
+ return error;
/* Join AG headers to the new transaction. */
if (sc->sa.agi_bp)
@@ -151,21 +148,6 @@ xrep_roll_ag_trans(
xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
return 0;
-
-out_release:
- /*
- * Rolling failed, so release the hold on the buffers. The
- * buffers will be released during teardown on our way out
- * of the kernel.
- */
- if (sc->sa.agi_bp)
- xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
- if (sc->sa.agf_bp)
- xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
-
- return error;
}
/*
@@ -367,7 +349,7 @@ xrep_init_btblock(
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
XFS_FSB_TO_BB(mp, 1), 0);
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+ xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
xfs_trans_log_buf(tp, bp, 0, bp->b_length);
bp->b_ops = ops;
@@ -682,7 +664,7 @@ xrep_findroot_agfl_walk(
{
xfs_agblock_t *agbno = priv;
- return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
+ return (*agbno == bno) ? XFS_ITER_ABORT : 0;
}
/* Does this block match the btree information passed in? */
@@ -712,7 +694,7 @@ xrep_findroot_block(
if (owner == XFS_RMAP_OWN_AG) {
error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
xrep_findroot_agfl_walk, &agbno);
- if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ if (error == XFS_ITER_ABORT)
return 0;
if (error)
return error;
@@ -743,7 +725,8 @@ xrep_findroot_block(
/* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
- if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ ASSERT(fab->buf_ops->magic[1] != 0);
+ if (btblock->bb_magic != fab->buf_ops->magic[1])
goto out;
/*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index f2fc18bb7605..60c61d7052a8 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -15,7 +15,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
/* Repair helpers */
-int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc, bool *fixed);
+int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
@@ -42,9 +42,6 @@ struct xrep_find_ag_btree {
/* in: buffer ops */
const struct xfs_buf_ops *buf_ops;
- /* in: magic number of the btree */
- uint32_t magic;
-
/* out: the highest btree block found and the tree height */
xfs_agblock_t root;
unsigned int height;
@@ -67,8 +64,7 @@ int xrep_agi(struct xfs_scrub *sc);
static inline int xrep_attempt(
struct xfs_inode *ip,
- struct xfs_scrub *sc,
- bool *fixed)
+ struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
}
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 92a140c5b55e..8d4cefd761c1 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -9,21 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_btree.h"
-#include "xfs_bit.h"
-#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "scrub/trace.h"
/*
* Set us up to scrub reverse mapping btrees.
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 665d4bbb17cc..c642bc206c41 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -9,19 +9,12 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Set us up with the realtime metadata locked. */
int
@@ -141,9 +134,8 @@ xchk_xref_is_used_rt_space(
startext = fsbno;
endext = fsbno + len - 1;
do_div(startext, sc->mp->m_sb.sb_rextsize);
- if (do_div(endext, sc->mp->m_sb.sb_rextsize))
- endext++;
- extcount = endext - startext;
+ do_div(endext, sc->mp->m_sb.sb_rextsize);
+ extcount = endext - startext + 1;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 1b2344d00525..15c8c5f3f688 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -9,37 +9,18 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_icache.h"
-#include "xfs_itable.h"
-#include "xfs_alloc.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_refcount.h"
-#include "xfs_refcount_btree.h"
-#include "xfs_rmap.h"
-#include "xfs_rmap_btree.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
-#include "xfs_log.h"
-#include "xfs_trans_priv.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
-#include "scrub/btree.h"
#include "scrub/repair.h"
+#include "scrub/health.h"
/*
* Online Scrub and Repair
@@ -186,8 +167,12 @@ xchk_teardown(
xfs_irele(sc->ip);
sc->ip = NULL;
}
- if (sc->has_quotaofflock)
+ if (sc->flags & XCHK_REAPING_DISABLED)
+ xchk_start_reaping(sc);
+ if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) {
mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+ sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK;
+ }
if (sc->buf) {
kmem_free(sc->buf);
sc->buf = NULL;
@@ -347,6 +332,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.scrub = xchk_quota,
.repair = xrep_notsupported,
},
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
+ .type = ST_FS,
+ .setup = xchk_setup_fscounters,
+ .scrub = xchk_fscounters,
+ .repair = xrep_notsupported,
+ },
};
/* This isn't a stable feature, warn once per day. */
@@ -466,10 +457,14 @@ xfs_scrub_metadata(
struct xfs_inode *ip,
struct xfs_scrub_metadata *sm)
{
- struct xfs_scrub sc;
+ struct xfs_scrub sc = {
+ .mp = ip->i_mount,
+ .sm = sm,
+ .sa = {
+ .agno = NULLAGNUMBER,
+ },
+ };
struct xfs_mount *mp = ip->i_mount;
- bool try_harder = false;
- bool already_fixed = false;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -491,21 +486,17 @@ xfs_scrub_metadata(
xchk_experimental_warning(mp);
+ sc.ops = &meta_scrub_ops[sm->sm_type];
+ sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
retry_op:
/* Set up for the operation. */
- memset(&sc, 0, sizeof(sc));
- sc.mp = ip->i_mount;
- sc.sm = sm;
- sc.ops = &meta_scrub_ops[sm->sm_type];
- sc.try_harder = try_harder;
- sc.sa.agno = NULLAGNUMBER;
error = sc.ops->setup(&sc, ip);
if (error)
goto out_teardown;
/* Scrub for errors. */
error = sc.ops->scrub(&sc);
- if (!try_harder && error == -EDEADLOCK) {
+ if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) {
/*
* Scrubbers return -EDEADLOCK to mean 'try harder'.
* Tear down everything we hold, then set up again with
@@ -514,12 +505,15 @@ retry_op:
error = xchk_teardown(&sc, ip, 0);
if (error)
goto out;
- try_harder = true;
+ sc.flags |= XCHK_TRY_HARDER;
goto retry_op;
} else if (error)
goto out_teardown;
- if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
+ xchk_update_health(&sc);
+
+ if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc.flags & XREP_ALREADY_FIXED)) {
bool needs_fix;
/* Let debug users force us into the repair routines. */
@@ -542,10 +536,13 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(ip, &sc, &already_fixed);
+ error = xrep_attempt(ip, &sc);
if (error == -EAGAIN) {
- if (sc.try_harder)
- try_harder = true;
+ /*
+ * Either the repair function succeeded or it couldn't
+ * get all the resources it needs; either way, we go
+ * back to the beginning and call the scrub function.
+ */
error = xchk_teardown(&sc, ip, 0);
if (error) {
xrep_failure(mp);
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 22f754fba8e5..ad1ceb44a628 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -62,13 +62,27 @@ struct xfs_scrub {
struct xfs_inode *ip;
void *buf;
uint ilock_flags;
- bool try_harder;
- bool has_quotaofflock;
+
+ /* See the XCHK/XREP state flags below. */
+ unsigned int flags;
+
+ /*
+ * The XFS_SICK_* flags that correspond to the metadata being scrubbed
+ * or repaired. We will use this mask to update the in-core fs health
+ * status with whatever we find.
+ */
+ unsigned int sick_mask;
/* State tracking for single-AG operations. */
struct xchk_ag sa;
};
+/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
+#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */
+#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */
+#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */
+#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */
+
/* Metadata scrubbers */
int xchk_tester(struct xfs_scrub *sc);
int xchk_superblock(struct xfs_scrub *sc);
@@ -113,6 +127,7 @@ xchk_quota(struct xfs_scrub *sc)
return -ENOENT;
}
#endif
+int xchk_fscounters(struct xfs_scrub *sc);
/* cross-referencing helpers */
void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
@@ -138,4 +153,12 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif
+struct xchk_fscounters {
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+};
+
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index f7ebaa946999..99c0b1234c3c 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -9,19 +9,11 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_btree.h"
-#include "xfs_bit.h"
#include "xfs_log_format.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
#include "xfs_inode.h"
-#include "xfs_inode_fork.h"
#include "xfs_symlink.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
-#include "scrub/trace.h"
/* Set us up to scrub a symbolic link. */
int
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 96feaf8dcdec..9eaab2eb5ed3 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -10,15 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_trans.h"
-#include "xfs_bit.h"
-#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
-#include "scrub/common.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 8344b14031ef..3362bae28b46 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -50,6 +50,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
#define XFS_SCRUB_TYPE_STRINGS \
{ XFS_SCRUB_TYPE_PROBE, "probe" }, \
@@ -75,7 +76,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
{ XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \
{ XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \
{ XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \
- { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }
+ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \
+ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }
DECLARE_EVENT_CLASS(xchk_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
@@ -223,6 +225,7 @@ DEFINE_EVENT(xchk_block_error_class, name, \
void *ret_ip), \
TP_ARGS(sc, daddr, ret_ip))
+DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error);
DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error);
DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen);
@@ -545,6 +548,109 @@ TRACE_EVENT(xchk_xref_error,
__entry->ret_ip)
);
+TRACE_EVENT(xchk_iallocbt_check_cluster,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, xfs_daddr_t map_daddr,
+ unsigned short map_len, unsigned int chunk_ino,
+ unsigned int nr_inodes, uint16_t cluster_mask,
+ uint16_t holemask, unsigned int cluster_ino),
+ TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes,
+ cluster_mask, holemask, cluster_ino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(xfs_daddr_t, map_daddr)
+ __field(unsigned short, map_len)
+ __field(unsigned int, chunk_ino)
+ __field(unsigned int, nr_inodes)
+ __field(unsigned int, cluster_ino)
+ __field(uint16_t, cluster_mask)
+ __field(uint16_t, holemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->map_daddr = map_daddr;
+ __entry->map_len = map_len;
+ __entry->chunk_ino = chunk_ino;
+ __entry->nr_inodes = nr_inodes;
+ __entry->cluster_mask = cluster_mask;
+ __entry->holemask = holemask;
+ __entry->cluster_ino = cluster_ino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->map_daddr,
+ __entry->map_len,
+ __entry->chunk_ino,
+ __entry->nr_inodes,
+ __entry->cluster_mask,
+ __entry->holemask,
+ __entry->cluster_ino)
+)
+
+TRACE_EVENT(xchk_fscounters_calc,
+ TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
+ uint64_t fdblocks, uint64_t delalloc),
+ TP_ARGS(mp, icount, ifree, fdblocks, delalloc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int64_t, icount_sb)
+ __field(uint64_t, icount_calculated)
+ __field(int64_t, ifree_sb)
+ __field(uint64_t, ifree_calculated)
+ __field(int64_t, fdblocks_sb)
+ __field(uint64_t, fdblocks_calculated)
+ __field(uint64_t, delalloc)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->icount_sb = mp->m_sb.sb_icount;
+ __entry->icount_calculated = icount;
+ __entry->ifree_sb = mp->m_sb.sb_ifree;
+ __entry->ifree_calculated = ifree;
+ __entry->fdblocks_sb = mp->m_sb.sb_fdblocks;
+ __entry->fdblocks_calculated = fdblocks;
+ __entry->delalloc = delalloc;
+ ),
+ TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->icount_sb,
+ __entry->icount_calculated,
+ __entry->ifree_sb,
+ __entry->ifree_calculated,
+ __entry->fdblocks_sb,
+ __entry->fdblocks_calculated,
+ __entry->delalloc)
+)
+
+TRACE_EVENT(xchk_fscounters_within_range,
+ TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value,
+ int64_t old_value),
+ TP_ARGS(mp, expected, curr_value, old_value),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint64_t, expected)
+ __field(int64_t, curr_value)
+ __field(int64_t, old_value)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->expected = expected;
+ __entry->curr_value = curr_value;
+ __entry->old_value = old_value;
+ ),
+ TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->expected,
+ __entry->curr_value,
+ __entry->old_value)
+)
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 8039e35147dd..cbda40d40326 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -4,16 +4,14 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_acl.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
-#include <linux/slab.h>
-#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d9048bcea49c..f16d5f196c6b 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -12,23 +12,19 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
-#include <linux/writeback.h>
/*
* structure owned by writepages passed to individual writepage calls
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
- unsigned int io_type;
+ int fork;
+ unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
@@ -62,7 +58,7 @@ xfs_find_daxdev_for_inode(
static void
xfs_finish_page_writeback(
struct inode *inode,
- struct bio_vec *bvec,
+ struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
@@ -97,7 +93,7 @@ xfs_destroy_ioend(
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
- int i;
+ struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
@@ -109,7 +105,7 @@ xfs_destroy_ioend(
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
- bio_for_each_segment_all(bvec, bio, i)
+ bio_for_each_segment_all(bvec, bio, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
@@ -137,8 +133,7 @@ xfs_setfilesize_trans_alloc(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
- XFS_TRANS_NOFS, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
if (error)
return error;
@@ -232,17 +227,24 @@ xfs_setfilesize_ioend(
* IO write completion.
*/
STATIC void
-xfs_end_io(
- struct work_struct *work)
+xfs_end_ioend(
+ struct xfs_ioend *ioend)
{
- struct xfs_ioend *ioend =
- container_of(work, struct xfs_ioend, io_work);
+ struct list_head ioend_list;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
+ unsigned int nofs_flag;
int error;
/*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ /*
* Just clean up the in-memory strutures if the fs has been shut down.
*/
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
@@ -255,35 +257,140 @@ xfs_end_io(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- break;
- }
-
goto done;
}
/*
- * Success: commit the COW or unwritten blocks if needed.
+ * Success: commit the COW or unwritten blocks if needed.
*/
- switch (ioend->io_type) {
- case XFS_IO_COW:
+ if (ioend->io_fork == XFS_COW_FORK)
error = xfs_reflink_end_cow(ip, offset, size);
- break;
- case XFS_IO_UNWRITTEN:
- /* writeback should never update isize */
+ else if (ioend->io_state == XFS_EXT_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- break;
- default:
+ else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
- break;
- }
done:
if (ioend->io_append_trans)
error = xfs_setfilesize_ioend(ioend, error);
+ list_replace_init(&ioend->io_list, &ioend_list);
xfs_destroy_ioend(ioend, error);
+
+ while (!list_empty(&ioend_list)) {
+ ioend = list_first_entry(&ioend_list, struct xfs_ioend,
+ io_list);
+ list_del_init(&ioend->io_list);
+ xfs_destroy_ioend(ioend, error);
+ }
+
+ memalloc_nofs_restore(nofs_flag);
+}
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+xfs_ioend_can_merge(
+ struct xfs_ioend *ioend,
+ struct xfs_ioend *next)
+{
+ if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ return false;
+ if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
+ return false;
+ if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
+ (next->io_state == XFS_EXT_UNWRITTEN))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ return true;
+}
+
+/*
+ * If the to be merged ioend has a preallocated transaction for file
+ * size updates we need to ensure the ioend it is merged into also
+ * has one. If it already has one we can simply cancel the transaction
+ * as it is guaranteed to be clean.
+ */
+static void
+xfs_ioend_merge_append_transactions(
+ struct xfs_ioend *ioend,
+ struct xfs_ioend *next)
+{
+ if (!ioend->io_append_trans) {
+ ioend->io_append_trans = next->io_append_trans;
+ next->io_append_trans = NULL;
+ } else {
+ xfs_setfilesize_ioend(next, -ECANCELED);
+ }
+}
+
+/* Try to merge adjacent completions. */
+STATIC void
+xfs_ioend_try_merge(
+ struct xfs_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct xfs_ioend *next_ioend;
+
+ while (!list_empty(more_ioends)) {
+ next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
+ io_list);
+ if (!xfs_ioend_can_merge(ioend, next_ioend))
+ break;
+ list_move_tail(&next_ioend->io_list, &ioend->io_list);
+ ioend->io_size += next_ioend->io_size;
+ if (next_ioend->io_append_trans)
+ xfs_ioend_merge_append_transactions(ioend, next_ioend);
+ }
+}
+
+/* list_sort compare function for ioends */
+static int
+xfs_ioend_compare(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ struct xfs_ioend *ia;
+ struct xfs_ioend *ib;
+
+ ia = container_of(a, struct xfs_ioend, io_list);
+ ib = container_of(b, struct xfs_ioend, io_list);
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ else if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+/* Finish all pending io completions. */
+void
+xfs_end_io(
+ struct work_struct *work)
+{
+ struct xfs_inode *ip;
+ struct xfs_ioend *ioend;
+ struct list_head completion_list;
+ unsigned long flags;
+
+ ip = container_of(work, struct xfs_inode, i_ioend_work);
+
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ list_replace_init(&ip->i_ioend_list, &completion_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+
+ list_sort(NULL, &completion_list, xfs_ioend_compare);
+
+ while (!list_empty(&completion_list)) {
+ ioend = list_first_entry(&completion_list, struct xfs_ioend,
+ io_list);
+ list_del_init(&ioend->io_list);
+ xfs_ioend_try_merge(ioend, &completion_list);
+ xfs_end_ioend(ioend);
+ }
}
STATIC void
@@ -291,16 +398,92 @@ xfs_end_bio(
struct bio *bio)
{
struct xfs_ioend *ioend = bio->bi_private;
- struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
-
- if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
- queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
- queue_work(mp->m_data_workqueue, &ioend->io_work);
- else
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned long flags;
+
+ if (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state == XFS_EXT_UNWRITTEN ||
+ ioend->io_append_trans != NULL) {
+ spin_lock_irqsave(&ip->i_ioend_lock, flags);
+ if (list_empty(&ip->i_ioend_list))
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
+ &ip->i_ioend_work));
+ list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+ spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
+ } else
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
+/*
+ * Fast revalidation of the cached writeback mapping. Return true if the current
+ * mapping is valid, false otherwise.
+ */
+static bool
+xfs_imap_valid(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ if (offset_fsb < wpc->imap.br_startoff ||
+ offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ return false;
+ /*
+ * If this is a COW mapping, it is sufficient to check that the mapping
+ * covers the offset. Be careful to check this first because the caller
+ * can revalidate a COW mapping without updating the data seqno.
+ */
+ if (wpc->fork == XFS_COW_FORK)
+ return true;
+
+ /*
+ * This is not a COW mapping. Check the sequence number of the data fork
+ * because concurrent changes could have invalidated the extent. Check
+ * the COW fork because concurrent changes since the last time we
+ * checked (and found nothing at this offset) could have added
+ * overlapping blocks.
+ */
+ if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+ return false;
+ if (xfs_inode_has_cow_data(ip) &&
+ wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ return false;
+ return true;
+}
+
+/*
+ * Pass in a delalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in wpc->imap.
+ *
+ * The current page is held locked so nothing could have removed the block
+ * backing offset_fsb, although it could have been moved from the COW to the
+ * data fork by another thread.
+ */
+static int
+xfs_convert_blocks(
+ struct xfs_writepage_ctx *wpc,
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb)
+{
+ int error;
+
+ /*
+ * Attempt to allocate whatever delalloc extent currently backs
+ * offset_fsb and put the result into wpc->imap. Allocate in a loop
+ * because it may take several attempts to allocate real blocks for a
+ * contiguous delalloc extent if free space is sufficiently fragmented.
+ */
+ do {
+ error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
+ &wpc->imap, wpc->fork == XFS_COW_FORK ?
+ &wpc->cow_seq : &wpc->data_seq);
+ if (error)
+ return error;
+ } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+
+ return 0;
+}
+
STATIC int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
@@ -310,26 +493,16 @@ xfs_map_blocks(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t count = i_blocksize(inode);
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
struct xfs_bmbt_irec imap;
- int whichfork = XFS_DATA_FORK;
struct xfs_iext_cursor icur;
- bool imap_valid;
+ int retries = 0;
int error = 0;
- /*
- * We have to make sure the cached mapping is within EOF to protect
- * against eofblocks trimming on file release leaving us with a stale
- * mapping. Otherwise, a page for a subsequent file extending buffered
- * write could get picked up by this writeback cycle and written to the
- * wrong blocks.
- *
- * Note that what we really want here is a generic mapping invalidation
- * mechanism to protect us from arbitrary extent modifying contexts, not
- * just eofblocks.
- */
- xfs_trim_extent_eof(&wpc->imap, ip);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
/*
* COW fork blocks can overlap data fork blocks even if the blocks
@@ -346,31 +519,19 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
- imap_valid = offset_fsb >= wpc->imap.br_startoff &&
- offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
- if (imap_valid &&
- (!xfs_inode_has_cow_data(ip) ||
- wpc->io_type == XFS_IO_COW ||
- wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq)))
+ if (xfs_imap_valid(wpc, ip, offset_fsb))
return 0;
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/*
* If we don't have a valid map, now it's time to get a new one for this
* offset. This will convert delayed allocations (including COW ones)
* into real extents. If we return without a valid map, it means we
* landed in a hole and we skip the block.
*/
+retry:
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
- ASSERT(offset <= mp->m_super->s_maxbytes);
-
- if (offset > mp->m_super->s_maxbytes - count)
- count = mp->m_super->s_maxbytes - offset;
- end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
/*
* Check if this is offset is covered by a COW extents, and if yes use
@@ -382,30 +543,16 @@ xfs_map_blocks(
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- /*
- * Truncate can race with writeback since writeback doesn't
- * take the iolock and truncate decreases the file size before
- * it starts truncating the pages between new_size and old_size.
- * Therefore, we can end up in the situation where writeback
- * gets a CoW fork mapping but the truncate makes the mapping
- * invalid and we end up in here trying to get a new mapping.
- * bail out here so that we simply never get a valid mapping
- * and so we drop the write altogether. The page truncation
- * will kill the contents anyway.
- */
- if (offset > i_size_read(inode)) {
- wpc->io_type = XFS_IO_HOLE;
- return 0;
- }
- whichfork = XFS_COW_FORK;
- wpc->io_type = XFS_IO_COW;
+
+ wpc->fork = XFS_COW_FORK;
goto allocate_blocks;
}
/*
- * Map valid and no COW extent in the way? We're done.
+ * No COW extent overlap. Revalidate now that we may have updated
+ * ->cow_seq. If the data mapping is still valid, we're done.
*/
- if (imap_valid) {
+ if (xfs_imap_valid(wpc, ip, offset_fsb)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
@@ -417,51 +564,65 @@ xfs_map_blocks(
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ wpc->fork = XFS_DATA_FORK;
+
+ /* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
- /* landed in a hole or beyond EOF */
imap.br_blockcount = imap.br_startoff - offset_fsb;
imap.br_startoff = offset_fsb;
imap.br_startblock = HOLESTARTBLOCK;
- wpc->io_type = XFS_IO_HOLE;
- } else {
- /*
- * Truncate to the next COW extent if there is one. This is the
- * only opportunity to do this because we can skip COW fork
- * lookups for the subsequent blocks in the mapping; however,
- * the requirement to treat the COW range separately remains.
- */
- if (cow_fsb != NULLFILEOFF &&
- cow_fsb < imap.br_startoff + imap.br_blockcount)
- imap.br_blockcount = cow_fsb - imap.br_startoff;
-
- if (isnullstartblock(imap.br_startblock)) {
- /* got a delalloc extent */
- wpc->io_type = XFS_IO_DELALLOC;
- goto allocate_blocks;
- }
-
- if (imap.br_state == XFS_EXT_UNWRITTEN)
- wpc->io_type = XFS_IO_UNWRITTEN;
- else
- wpc->io_type = XFS_IO_OVERWRITE;
+ imap.br_state = XFS_EXT_NORM;
}
+ /*
+ * Truncate to the next COW extent if there is one. This is the only
+ * opportunity to do this because we can skip COW fork lookups for the
+ * subsequent blocks in the mapping; however, the requirement to treat
+ * the COW range separately remains.
+ */
+ if (cow_fsb != NULLFILEOFF &&
+ cow_fsb < imap.br_startoff + imap.br_blockcount)
+ imap.br_blockcount = cow_fsb - imap.br_startoff;
+
+ /* got a delalloc extent? */
+ if (imap.br_startblock != HOLESTARTBLOCK &&
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
+ trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
return 0;
allocate_blocks:
- error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap,
- &wpc->cow_seq);
- if (error)
+ error = xfs_convert_blocks(wpc, ip, offset_fsb);
+ if (error) {
+ /*
+ * If we failed to find the extent in the COW fork we might have
+ * raced with a COW to data fork conversion or truncate.
+ * Restart the lookup to catch the extent in the data fork for
+ * the former case, but prevent additional retries to avoid
+ * looping forever for the latter case.
+ */
+ if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+ goto retry;
+ ASSERT(error != -EAGAIN);
return error;
- ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF ||
- imap.br_startoff + imap.br_blockcount <= cow_fsb);
- wpc->imap = imap;
- xfs_trim_extent_eof(&wpc->imap, ip);
- trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
+ }
+
+ /*
+ * Due to merging the returned real extent might be larger than the
+ * original delalloc one. Trim the returned extent to the next COW
+ * boundary again to force a re-lookup.
+ */
+ if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
+ cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
+ wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+
+ ASSERT(wpc->imap.br_startoff <= offset_fsb);
+ ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
+ trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
return 0;
}
@@ -473,7 +634,7 @@ allocate_blocks:
* reference to the ioend to ensure that the ioend completion is only done once
* all bios have been submitted and the ioend is really done.
*
- * If @fail is non-zero, it means that we have a situation where some part of
+ * If @status is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked paged for writeback
* and unlocked them. In this situation, we need to fail the bio and ioend
* rather than submit it to IO. This typically only happens on a filesystem
@@ -485,33 +646,33 @@ xfs_submit_ioend(
struct xfs_ioend *ioend,
int status)
{
- /* Convert CoW extents to regular */
- if (!status && ioend->io_type == XFS_IO_COW) {
- /*
- * Yuk. This can do memory allocation, but is not a
- * transactional operation so everything is done in GFP_KERNEL
- * context. That can deadlock, because we hold pages in
- * writeback state and GFP_KERNEL allocations can block on them.
- * Hence we must operate in nofs conditions here.
- */
- unsigned nofs_flag;
+ unsigned int nofs_flag;
- nofs_flag = memalloc_nofs_save();
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
+ /* Convert CoW extents to regular */
+ if (!status && ioend->io_fork == XFS_COW_FORK) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
- memalloc_nofs_restore(nofs_flag);
}
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
- ioend->io_type != XFS_IO_UNWRITTEN &&
+ (ioend->io_fork == XFS_COW_FORK ||
+ ioend->io_state != XFS_EXT_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
status = xfs_setfilesize_trans_alloc(ioend);
+ memalloc_nofs_restore(nofs_flag);
+
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
- ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
/*
* If we are failing the IO now, just mark the ioend with an
@@ -525,7 +686,6 @@ xfs_submit_ioend(
return status;
}
- ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
submit_bio(ioend->io_bio);
return 0;
}
@@ -533,10 +693,12 @@ xfs_submit_ioend(
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
- unsigned int type,
+ int fork,
+ xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
- sector_t sector)
+ sector_t sector,
+ struct writeback_control *wbc)
{
struct xfs_ioend *ioend;
struct bio *bio;
@@ -544,14 +706,17 @@ xfs_alloc_ioend(
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+ bio->bi_write_hint = inode->i_write_hint;
+ wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = type;
+ ioend->io_fork = fork;
+ ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
- INIT_WORK(&ioend->io_work, xfs_end_io);
ioend->io_append_trans = NULL;
ioend->io_bio = bio;
return ioend;
@@ -564,24 +729,22 @@ xfs_alloc_ioend(
* so that the bi_private linkage is set up in the right direction for the
* traversal in xfs_destroy_ioend().
*/
-static void
+static struct bio *
xfs_chain_bio(
- struct xfs_ioend *ioend,
- struct writeback_control *wbc,
- struct block_device *bdev,
- sector_t sector)
+ struct bio *prev)
{
struct bio *new;
new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
- bio_set_dev(new, bdev);
- new->bi_iter.bi_sector = sector;
- bio_chain(ioend->io_bio, new);
- bio_get(ioend->io_bio); /* for xfs_destroy_ioend */
- ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
- ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
- submit_bio(ioend->io_bio);
- ioend->io_bio = new;
+ bio_copy_dev(new, prev);/* also copies over blkcg information */
+ new->bi_iter.bi_sector = bio_end_sector(prev);
+ new->bi_opf = prev->bi_opf;
+ new->bi_write_hint = prev->bi_write_hint;
+
+ bio_chain(prev, new);
+ bio_get(prev); /* for xfs_destroy_ioend */
+ submit_bio(prev);
+ return new;
}
/*
@@ -603,29 +766,37 @@ xfs_add_to_ioend(
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
unsigned len = i_blocksize(inode);
unsigned poff = offset & (PAGE_SIZE - 1);
+ bool merged, same_page = false;
sector_t sector;
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
- if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+ if (!wpc->ioend ||
+ wpc->fork != wpc->ioend->io_fork ||
+ wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
+ wpc->imap.br_state, offset, bdev, sector, wbc);
}
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
- if (iop)
- atomic_inc(&iop->write_count);
- if (bio_full(wpc->ioend->io_bio))
- xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
- __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+ merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+ &same_page);
+
+ if (iop && !same_page)
+ atomic_inc(&iop->write_count);
+
+ if (!merged) {
+ if (bio_full(wpc->ioend->io_bio, len))
+ wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
+ bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
+ wbc_account_cgroup_owner(wbc, page, len);
}
STATIC void
@@ -723,7 +894,7 @@ xfs_writepage_map(
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
- if (wpc->io_type == XFS_IO_HOLE)
+ if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
@@ -918,9 +1089,7 @@ xfs_vm_writepage(
struct page *page,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
@@ -934,9 +1103,7 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_HOLE,
- };
+ struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
@@ -983,7 +1150,7 @@ xfs_vm_bmap(
* Since we don't pass back blockdev info, we can't return bmap
* information for rt files either.
*/
- if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
+ if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e5c23948a8ab..45a1ea240cbb 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,36 +9,15 @@
extern struct bio_set xfs_ioend_bioset;
/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- *
- * This enum is used in string mapping in xfs_trace.h; please keep the
- * TRACE_DEFINE_ENUMs for it up to date.
- */
-enum {
- XFS_IO_HOLE, /* covers region without any block allocation */
- XFS_IO_DELALLOC, /* covers delalloc region */
- XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
- XFS_IO_OVERWRITE, /* covers already allocated extent */
- XFS_IO_COW, /* covers copy-on-write extent */
-};
-
-#define XFS_IO_TYPES \
- { XFS_IO_HOLE, "hole" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }
-
-/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
- unsigned int io_type; /* delalloc / unwritten */
+ int io_fork; /* inode fork written back */
+ xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
- struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
struct bio *io_bio; /* bio being built */
struct bio io_inline_bio; /* MUST BE LAST! */
@@ -49,7 +28,6 @@ extern const struct address_space_operations xfs_dax_aops;
int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
-extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 228821b2ebe0..dc93c51c17de 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -15,18 +15,13 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_attr_remote.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
-#include "xfs_trace.h"
#include "xfs_dir2.h"
-#include "xfs_defer.h"
/*
* Look at all the extents for this logical region,
@@ -121,7 +116,7 @@ xfs_attr3_leaf_inactive(
int size;
int tmp;
int i;
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a58034049995..58fc820a70c6 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -6,25 +6,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
-#include "xfs_buf_item.h"
-#include "xfs_cksum.h"
#include "xfs_dir2.h"
STATIC int
@@ -555,6 +550,7 @@ xfs_attr_put_listent(
attrlist_ent_t *aep;
int arraytop;
+ ASSERT(!context->seen_enough);
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
new file mode 100644
index 000000000000..e2148f2d5d6b
--- /dev/null
+++ b/fs/xfs/xfs_bio_io.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig.
+ */
+#include "xfs.h"
+
+static inline unsigned int bio_max_vecs(unsigned int count)
+{
+ return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
+}
+
+int
+xfs_rw_bdev(
+ struct block_device *bdev,
+ sector_t sector,
+ unsigned int count,
+ char *data,
+ unsigned int op)
+
+{
+ unsigned int is_vmalloc = is_vmalloc_addr(data);
+ unsigned int left = count;
+ int error;
+ struct bio *bio;
+
+ if (is_vmalloc && op == REQ_OP_WRITE)
+ flush_kernel_vmap_range(data, count);
+
+ bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
+ bio_set_dev(bio, bdev);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_opf = op | REQ_META | REQ_SYNC;
+
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
+
+ while (bio_add_page(bio, page, len, off) != len) {
+ struct bio *prev = bio;
+
+ bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
+ bio_copy_dev(bio, prev);
+ bio->bi_iter.bi_sector = bio_end_sector(prev);
+ bio->bi_opf = prev->bi_opf;
+ bio_chain(prev, bio);
+
+ submit_bio(prev);
+ }
+
+ data += len;
+ left -= len;
+ } while (left > 0);
+
+ error = submit_bio_wait(bio);
+ bio_put(bio);
+
+ if (is_vmalloc && op == REQ_OP_READ)
+ invalidate_kernel_vmap_range(data, count);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index ce45f066995e..9fa4a7ee8cfc 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -9,17 +9,16 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_shared.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_bmap_item.h"
#include "xfs_log.h"
#include "xfs_bmap.h"
#include "xfs_icache.h"
-#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
@@ -96,15 +95,6 @@ xfs_bui_item_format(
}
/*
- * Pinning has no meaning for an bui item, so just return.
- */
-STATIC void
-xfs_bui_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an BUI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the BUI transaction has been successfully committed to make it
@@ -123,71 +113,22 @@ xfs_bui_item_unpin(
}
/*
- * BUI items have no locking or pushing. However, since BUIs are pulled from
- * the AIL when their corresponding BUDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the BUI out of
- * the AIL.
- */
-STATIC uint
-xfs_bui_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The BUI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an BUD isn't going to be
* constructed and thus we free the BUI here directly.
*/
STATIC void
-xfs_bui_item_unlock(
+xfs_bui_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_bui_release(BUI_ITEM(lip));
-}
-
-/*
- * The BUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_bui_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
+ xfs_bui_release(BUI_ITEM(lip));
}
-/*
- * The BUI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_bui_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all bui log items.
- */
static const struct xfs_item_ops xfs_bui_item_ops = {
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
- .iop_pin = xfs_bui_item_pin,
.iop_unpin = xfs_bui_item_unpin,
- .iop_unlock = xfs_bui_item_unlock,
- .iop_committed = xfs_bui_item_committed,
- .iop_push = xfs_bui_item_push,
- .iop_committing = xfs_bui_item_committing,
+ .iop_release = xfs_bui_item_release,
};
/*
@@ -249,126 +190,241 @@ xfs_bud_item_format(
}
/*
- * Pinning has no meaning for an bud item, so just return.
+ * The BUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the BUI and free the
+ * BUD.
*/
STATIC void
-xfs_bud_item_pin(
+xfs_bud_item_release(
struct xfs_log_item *lip)
{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+
+ xfs_bui_release(budp->bud_buip);
+ kmem_zone_free(xfs_bud_zone, budp);
}
-/*
- * Since pinning has no meaning for an bud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_bud_item_unpin(
- struct xfs_log_item *lip,
- int remove)
+static const struct xfs_item_ops xfs_bud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .iop_size = xfs_bud_item_size,
+ .iop_format = xfs_bud_item_format,
+ .iop_release = xfs_bud_item_release,
+};
+
+static struct xfs_bud_log_item *
+xfs_trans_get_bud(
+ struct xfs_trans *tp,
+ struct xfs_bui_log_item *buip)
{
+ struct xfs_bud_log_item *budp;
+
+ budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
+ xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+ &xfs_bud_item_ops);
+ budp->bud_buip = buip;
+ budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+ xfs_trans_add_item(tp, &budp->bud_item);
+ return budp;
}
/*
- * There isn't much you can do to push on an bud item. It is simply stuck
- * waiting for the log to be flushed to disk.
+ * Finish a bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails to support the BUI/BUD lifecycle rules.
*/
-STATIC uint
-xfs_bud_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
+static int
+xfs_trans_log_finish_bmap_update(
+ struct xfs_trans *tp,
+ struct xfs_bud_log_item *budp,
+ enum xfs_bmap_intent_type type,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t *blockcount,
+ xfs_exntst_t state)
{
- return XFS_ITEM_PINNED;
+ int error;
+
+ error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
+ startblock, blockcount, state);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the BUI and frees the BUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ return error;
}
-/*
- * The BUD is either committed or aborted if the transaction is cancelled. If
- * the transaction is cancelled, drop our reference to the BUI and free the
- * BUD.
- */
-STATIC void
-xfs_bud_item_unlock(
- struct xfs_log_item *lip)
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
{
- struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+ struct xfs_bmap_intent *ba;
+ struct xfs_bmap_intent *bb;
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_bui_release(budp->bud_buip);
- kmem_zone_free(xfs_bud_zone, budp);
- }
+ ba = container_of(a, struct xfs_bmap_intent, bi_list);
+ bb = container_of(b, struct xfs_bmap_intent, bi_list);
+ return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/*
- * When the bud item is committed to disk, all we need to do is delete our
- * reference to our partner bui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_bud_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+/* Get an BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
{
- struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+ struct xfs_bui_log_item *buip;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+ ASSERT(tp != NULL);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ ASSERT(buip != NULL);
/*
- * Drop the BUI reference regardless of whether the BUD has been
- * aborted. Once the BUD transaction is constructed, it is the sole
- * responsibility of the BUD to release the BUI (even if the BUI is
- * aborted due to log I/O error).
+ * Get a log_item_desc to point at the new item.
*/
- xfs_bui_release(budp->bud_buip);
- kmem_zone_free(xfs_bud_zone, budp);
+ xfs_trans_add_item(tp, &buip->bui_item);
+ return buip;
+}
- return (xfs_lsn_t)-1;
+/* Set the map extent flags for this mapping. */
+static void
+xfs_trans_set_bmap_flags(
+ struct xfs_map_extent *bmap,
+ enum xfs_bmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
+{
+ bmap->me_flags = 0;
+ switch (type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+ bmap->me_flags = type;
+ break;
+ default:
+ ASSERT(0);
+ }
+ if (state == XFS_EXT_UNWRITTEN)
+ bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
}
-/*
- * The BUD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
+/* Log bmap updates in the intent item. */
STATIC void
-xfs_bud_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+xfs_bmap_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
{
+ struct xfs_bui_log_item *buip = intent;
+ struct xfs_bmap_intent *bmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+ ASSERT(next_extent < buip->bui_format.bui_nextents);
+ map = &buip->bui_format.bui_extents[next_extent];
+ map->me_owner = bmap->bi_owner->i_ino;
+ map->me_startblock = bmap->bi_bmap.br_startblock;
+ map->me_startoff = bmap->bi_bmap.br_startoff;
+ map->me_len = bmap->bi_bmap.br_blockcount;
+ xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+ bmap->bi_bmap.br_state);
}
-/*
- * This is the ops vector shared by all bud log items.
- */
-static const struct xfs_item_ops xfs_bud_item_ops = {
- .iop_size = xfs_bud_item_size,
- .iop_format = xfs_bud_item_format,
- .iop_pin = xfs_bud_item_pin,
- .iop_unpin = xfs_bud_item_unpin,
- .iop_unlock = xfs_bud_item_unlock,
- .iop_committed = xfs_bud_item_committed,
- .iop_push = xfs_bud_item_push,
- .iop_committing = xfs_bud_item_committing,
-};
+/* Get a BUD so we can process all the deferred bmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_bud(tp, intent);
+}
-/*
- * Allocate and initialize an bud item with the given number of extents.
- */
-struct xfs_bud_log_item *
-xfs_bud_init(
- struct xfs_mount *mp,
- struct xfs_bui_log_item *buip)
+/* Process a deferred bmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_bmap_intent *bmap;
+ xfs_filblks_t count;
+ int error;
+
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ count = bmap->bi_bmap.br_blockcount;
+ error = xfs_trans_log_finish_bmap_update(tp, done_item,
+ bmap->bi_type,
+ bmap->bi_owner, bmap->bi_whichfork,
+ bmap->bi_bmap.br_startoff,
+ bmap->bi_bmap.br_startblock,
+ &count,
+ bmap->bi_bmap.br_state);
+ if (!error && count > 0) {
+ ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
+ bmap->bi_bmap.br_blockcount = count;
+ return -EAGAIN;
+ }
+ kmem_free(bmap);
+ return error;
+}
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+ void *intent)
{
- struct xfs_bud_log_item *budp;
+ xfs_bui_release(intent);
+}
- budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
- xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops);
- budp->bud_buip = buip;
- budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_bmap_intent *bmap;
- return budp;
+ bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+ kmem_free(bmap);
}
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_bmap_update_diff_items,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .log_item = xfs_bmap_update_log_item,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+};
+
/*
* Process a bmap update intent item that was recovered from the log.
* We need to update some inode's bmbt.
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 89e043a88bb8..ad479cc73de8 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -75,8 +75,6 @@ extern struct kmem_zone *xfs_bui_zone;
extern struct kmem_zone *xfs_bud_zone;
struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
-struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
- struct xfs_bui_log_item *);
void xfs_bui_item_free(struct xfs_bui_log_item *);
void xfs_bui_release(struct xfs_bui_log_item *);
int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 1ee8c5539fa4..98c6a7a71427 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -12,12 +12,10 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
-#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -28,11 +26,8 @@
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
-#include "xfs_rmap_btree.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"
-#include "xfs_refcount.h"
/* Kernel only BMAP related definitions and functions */
@@ -276,7 +271,7 @@ xfs_bmap_count_tree(
struct xfs_btree_block *block, *nextblock;
int numrecs;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
return error;
@@ -287,7 +282,7 @@ xfs_bmap_count_tree(
/* Not at node above leaves, count this level of nodes */
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
while (nextbno != NULLFSBLOCK) {
- error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+ error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -321,7 +316,7 @@ xfs_bmap_count_tree(
if (nextbno == NULLFSBLOCK)
break;
bno = nextbno;
- error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ error = xfs_btree_read_bufl(mp, tp, bno, &bp,
XFS_BMAP_BTREE_REF,
&xfs_bmbt_buf_ops);
if (error)
@@ -1162,16 +1157,13 @@ xfs_zero_file_space(
* by virtue of the hole punch.
*/
error = xfs_free_file_space(ip, offset, len);
- if (error)
- goto out;
+ if (error || xfs_is_always_cow_inode(ip))
+ return error;
- error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+ return xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
-out:
- return error;
-
}
static int
@@ -1196,6 +1188,8 @@ xfs_prepare_shift(
* about to shift down every extent from offset to EOF.
*/
error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
+ if (error)
+ return error;
/*
* Clean out anything hanging around in the cow fork now that
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4f5f2ff3f70f..ca0849043f54 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -4,24 +4,9 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include <linux/migrate.h>
#include <linux/backing-dev.h>
-#include <linux/freezer.h>
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -213,7 +198,7 @@ xfs_buf_free_maps(
}
}
-struct xfs_buf *
+static struct xfs_buf *
_xfs_buf_alloc(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
@@ -243,6 +228,7 @@ _xfs_buf_alloc(
sema_init(&bp->b_sema, 0); /* held, no waiters */
spin_lock_init(&bp->b_lock);
bp->b_target = target;
+ bp->b_mount = target->bt_mount;
bp->b_flags = flags;
/*
@@ -263,12 +249,11 @@ _xfs_buf_alloc(
bp->b_maps[i].bm_len = map[i].bm_len;
bp->b_length += map[i].bm_len;
}
- bp->b_io_length = bp->b_length;
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(target->bt_mount, xb_create);
+ XFS_STATS_INC(bp->b_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -425,12 +410,12 @@ retry:
current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
+ XFS_STATS_INC(bp->b_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
+ XFS_STATS_INC(bp->b_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -776,29 +761,24 @@ _xfs_buf_read(
}
/*
- * Set buffer ops on an unchecked buffer and validate it, if possible.
- *
- * If the caller passed in an ops structure and the buffer doesn't have ops
- * assigned, set the ops and use them to verify the contents. If the contents
- * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
- * recorded errors and is already in XBF_DONE state.
+ * Reverify a buffer found in cache without an attached ->b_ops.
*
- * Under normal operations, every in-core buffer must have buffer ops assigned
- * to them when the buffer is read in from disk so that we can validate the
- * metadata.
+ * If the caller passed an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use it to verify the contents. If verification
+ * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
+ * already in XBF_DONE state on entry.
*
- * However, there are two scenarios where one can encounter in-core buffers
- * that don't have buffer ops. The first is during log recovery of buffers on
- * a V4 filesystem, though these buffers are purged at the end of recovery.
- *
- * The other is online repair, which tries to match arbitrary metadata blocks
- * with btree types in order to find the root. If online repair doesn't match
- * the buffer with /any/ btree type, the buffer remains in memory in DONE state
- * with no ops, and a subsequent read_buf call from elsewhere will not set the
- * ops. This function helps us fix this situation.
+ * Under normal operations, every in-core buffer is verified on read I/O
+ * completion. There are two scenarios that can lead to in-core buffers without
+ * an assigned ->b_ops. The first is during log recovery of buffers on a V4
+ * filesystem, though these buffers are purged at the end of recovery. The
+ * other is online repair, which intentionally reads with a NULL buffer ops to
+ * run several verifiers across an in-core buffer in order to establish buffer
+ * type. If repair can't establish that, the buffer will be left in memory
+ * with NULL buffer ops.
*/
int
-xfs_buf_ensure_ops(
+xfs_buf_reverify(
struct xfs_buf *bp,
const struct xfs_buf_ops *ops)
{
@@ -840,7 +820,7 @@ xfs_buf_read_map(
return bp;
}
- xfs_buf_ensure_ops(bp, ops);
+ xfs_buf_reverify(bp, ops);
if (flags & XBF_ASYNC) {
/*
@@ -914,83 +894,6 @@ xfs_buf_read_uncached(
return 0;
}
-/*
- * Return a buffer allocated as an empty buffer and associated to external
- * memory via xfs_buf_associate_memory() back to it's empty state.
- */
-void
-xfs_buf_set_empty(
- struct xfs_buf *bp,
- size_t numblks)
-{
- if (bp->b_pages)
- _xfs_buf_free_pages(bp);
-
- bp->b_pages = NULL;
- bp->b_page_count = 0;
- bp->b_addr = NULL;
- bp->b_length = numblks;
- bp->b_io_length = numblks;
-
- ASSERT(bp->b_map_count == 1);
- bp->b_bn = XFS_BUF_DADDR_NULL;
- bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
- bp->b_maps[0].bm_len = bp->b_length;
-}
-
-static inline struct page *
-mem_to_page(
- void *addr)
-{
- if ((!is_vmalloc_addr(addr))) {
- return virt_to_page(addr);
- } else {
- return vmalloc_to_page(addr);
- }
-}
-
-int
-xfs_buf_associate_memory(
- xfs_buf_t *bp,
- void *mem,
- size_t len)
-{
- int rval;
- int i = 0;
- unsigned long pageaddr;
- unsigned long offset;
- size_t buflen;
- int page_count;
-
- pageaddr = (unsigned long)mem & PAGE_MASK;
- offset = (unsigned long)mem - pageaddr;
- buflen = PAGE_ALIGN(len + offset);
- page_count = buflen >> PAGE_SHIFT;
-
- /* Free any previous set of page pointers */
- if (bp->b_pages)
- _xfs_buf_free_pages(bp);
-
- bp->b_pages = NULL;
- bp->b_addr = mem;
-
- rval = _xfs_buf_get_pages(bp, page_count);
- if (rval)
- return rval;
-
- bp->b_offset = offset;
-
- for (i = 0; i < bp->b_page_count; i++) {
- bp->b_pages[i] = mem_to_page((void *)pageaddr);
- pageaddr += PAGE_SIZE;
- }
-
- bp->b_io_length = BTOBB(len);
- bp->b_length = BTOBB(buflen);
-
- return 0;
-}
-
xfs_buf_t *
xfs_buf_get_uncached(
struct xfs_buftarg *target,
@@ -1185,7 +1088,7 @@ xfs_buf_lock(
trace_xfs_buf_lock(bp, _RET_IP_);
if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
- xfs_log_force(bp->b_target->bt_mount, 0);
+ xfs_log_force(bp->b_mount, 0);
down(&bp->b_sema);
trace_xfs_buf_lock_done(bp, _RET_IP_);
@@ -1274,7 +1177,7 @@ xfs_buf_ioend_async(
struct xfs_buf *bp)
{
INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
- queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
+ queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
}
void
@@ -1293,7 +1196,7 @@ xfs_buf_ioerror_alert(
struct xfs_buf *bp,
const char *func)
{
- xfs_alert(bp->b_target->bt_mount,
+ xfs_alert(bp->b_mount,
"metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d",
func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length,
-bp->b_error);
@@ -1312,10 +1215,8 @@ xfs_bwrite(
XBF_WRITE_FAIL | XBF_DONE);
error = xfs_buf_submit(bp);
- if (error) {
- xfs_force_shutdown(bp->b_target->bt_mount,
- SHUTDOWN_META_IO_ERROR);
- }
+ if (error)
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
return error;
}
@@ -1441,21 +1342,8 @@ _xfs_buf_ioapply(
*/
bp->b_error = 0;
- /*
- * Initialize the I/O completion workqueue if we haven't yet or the
- * submitter has not opted to specify a custom one.
- */
- if (!bp->b_ioend_wq)
- bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;
-
if (bp->b_flags & XBF_WRITE) {
op = REQ_OP_WRITE;
- if (bp->b_flags & XBF_SYNCIO)
- op_flags = REQ_SYNC;
- if (bp->b_flags & XBF_FUA)
- op_flags |= REQ_FUA;
- if (bp->b_flags & XBF_FLUSH)
- op_flags |= REQ_PREFLUSH;
/*
* Run the write verifier callback function if it exists. If
@@ -1465,12 +1353,12 @@ _xfs_buf_ioapply(
if (bp->b_ops) {
bp->b_ops->verify_write(bp);
if (bp->b_error) {
- xfs_force_shutdown(bp->b_target->bt_mount,
+ xfs_force_shutdown(bp->b_mount,
SHUTDOWN_CORRUPT_INCORE);
return;
}
} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
/*
* non-crc filesystems don't attach verifiers during
@@ -1502,7 +1390,7 @@ _xfs_buf_ioapply(
* subsequent call.
*/
offset = bp->b_offset;
- size = BBTOB(bp->b_io_length);
+ size = BBTOB(bp->b_length);
blk_start_plug(&plug);
for (i = 0; i < bp->b_map_count; i++) {
xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
@@ -1548,7 +1436,7 @@ __xfs_buf_submit(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
/* on shutdown we stale and complete the buffer immediately */
- if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
xfs_buf_ioerror(bp, -EIO);
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
@@ -1618,16 +1506,11 @@ xfs_buf_offset(
return page_address(page) + (offset & (PAGE_SIZE-1));
}
-/*
- * Move data into or out of a buffer.
- */
void
-xfs_buf_iomove(
- xfs_buf_t *bp, /* buffer to process */
- size_t boff, /* starting buffer offset */
- size_t bsize, /* length to copy */
- void *data, /* data address */
- xfs_buf_rw_t mode) /* read/write/zero flag */
+xfs_buf_zero(
+ struct xfs_buf *bp,
+ size_t boff,
+ size_t bsize)
{
size_t bend;
@@ -1640,23 +1523,13 @@ xfs_buf_iomove(
page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
page = bp->b_pages[page_index];
csize = min_t(size_t, PAGE_SIZE - page_offset,
- BBTOB(bp->b_io_length) - boff);
+ BBTOB(bp->b_length) - boff);
ASSERT((csize + page_offset) <= PAGE_SIZE);
- switch (mode) {
- case XBRW_ZERO:
- memset(page_address(page) + page_offset, 0, csize);
- break;
- case XBRW_READ:
- memcpy(data, page_address(page) + page_offset, csize);
- break;
- case XBRW_WRITE:
- memcpy(page_address(page) + page_offset, data, csize);
- }
+ memset(page_address(page) + page_offset, 0, csize);
boff += csize;
- data += csize;
}
}
@@ -2203,9 +2076,45 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
* This allows userspace to disrupt buffer caching for debug/testing
* purposes.
*/
- if (XFS_TEST_ERROR(false, bp->b_target->bt_mount,
- XFS_ERRTAG_BUF_LRU_REF))
+ if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
lru_ref = 0;
atomic_set(&bp->b_lru_ref, lru_ref);
}
+
+/*
+ * Verify an on-disk magic value against the magic value specified in the
+ * verifier structure. The verifier magic is in disk byte order so the caller is
+ * expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic(
+ struct xfs_buf *bp,
+ __be32 dmagic)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic[idx];
+}
+/*
+ * Verify a 16-bit on-disk magic value against the 16-bit magic value specified
+ * in the verifier structure. The verifier magic is in disk byte order so the
+ * caller is expected to pass the value directly from disk.
+ */
+bool
+xfs_verify_magic16(
+ struct xfs_buf *bp,
+ __be16 dmagic)
+{
+ struct xfs_mount *mp = bp->b_mount;
+ int idx;
+
+ idx = xfs_sb_version_hascrc(&mp->m_sb);
+ if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])))
+ return false;
+ return dmagic == bp->b_ops->magic16[idx];
+}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b9f5511ea998..c6e57a3f409e 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -21,12 +21,6 @@
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
-typedef enum {
- XBRW_READ = 1, /* transfer into target memory */
- XBRW_WRITE = 2, /* transfer from target memory */
- XBRW_ZERO = 3, /* Zero target memory */
-} xfs_buf_rw_t;
-
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
@@ -34,12 +28,7 @@ typedef enum {
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
-#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */
-
-/* I/O hints for the BIO layer */
-#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA (1 << 11)/* force cache write through mode */
-#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */
+#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */
/* flags used only as arguments to access routines */
#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */
@@ -49,7 +38,6 @@ typedef enum {
#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
-#define _XBF_COMPOUND (1 << 23)/* compound buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -62,15 +50,11 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
- { XBF_SYNCIO, "SYNCIO" }, \
- { XBF_FUA, "FUA" }, \
- { XBF_FLUSH, "FLUSH" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\
{ XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_COMPOUND, "COMPOUND" }
+ { _XBF_DELWRI_Q, "DELWRI_Q" }
/*
@@ -125,6 +109,10 @@ struct xfs_buf_map {
struct xfs_buf_ops {
char *name;
+ union {
+ __be32 magic[2]; /* v4 and v5 on disk magic values */
+ __be16 magic16[2]; /* v4 and v5 on disk magic values */
+ };
void (*verify_read)(struct xfs_buf *);
void (*verify_write)(struct xfs_buf *);
xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp);
@@ -157,13 +145,13 @@ typedef struct xfs_buf {
wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */
+ struct xfs_mount *b_mount;
xfs_buftarg_t *b_target; /* buffer target (device) */
void *b_addr; /* virtual address of buffer */
struct work_struct b_ioend_work;
- struct workqueue_struct *b_ioend_wq; /* I/O completion wq */
xfs_buf_iodone_t b_iodone; /* I/O completion function */
struct completion b_iowait; /* queue for I/O waiters */
- void *b_log_item;
+ struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
struct xfs_trans *b_transp;
struct page **b_pages; /* array of page pointers */
@@ -171,7 +159,6 @@ typedef struct xfs_buf {
struct xfs_buf_map *b_maps; /* compound buffer map */
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
- int b_io_length; /* IO size in BBs */
atomic_t b_pin_count; /* pin count */
atomic_t b_io_remaining; /* #outstanding I/O requests */
unsigned int b_page_count; /* size of page array */
@@ -205,21 +192,6 @@ struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
xfs_daddr_t blkno, size_t numblks,
xfs_buf_flags_t flags);
-struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags);
-
-static inline struct xfs_buf *
-xfs_buf_alloc(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
-{
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return _xfs_buf_alloc(target, &map, 1, flags);
-}
-
struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
xfs_buf_flags_t flags);
@@ -235,11 +207,10 @@ static inline struct xfs_buf *
xfs_buf_get(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
+ size_t numblks)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_get_map(target, &map, 1, flags);
+ return xfs_buf_get_map(target, &map, 1, 0);
}
static inline struct xfs_buf *
@@ -265,9 +236,6 @@ xfs_buf_readahead(
return xfs_buf_readahead_map(target, &map, 1, ops);
}
-void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
-int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
-
struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
int flags);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
@@ -301,10 +269,7 @@ static inline int xfs_buf_submit(struct xfs_buf *bp)
return __xfs_buf_submit(bp, wait);
}
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
- xfs_buf_rw_t);
-#define xfs_buf_zero(bp, off, len) \
- xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
@@ -385,6 +350,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
-int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
+bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 010db5f8fb00..7dcaec54a20b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -5,19 +5,17 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_log.h"
-#include "xfs_inode.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -520,7 +518,7 @@ xfs_buf_item_push(
/* has a previous flush failed due to IO errors? */
if ((bp->b_flags & XBF_WRITE_FAIL) &&
___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
- xfs_warn(bp->b_target->bt_mount,
+ xfs_warn(bp->b_mount,
"Failing async write on buffer block 0x%llx. Retrying async write.",
(long long)bp->b_bn);
}
@@ -594,7 +592,7 @@ xfs_buf_item_put(
* free the item.
*/
STATIC void
-xfs_buf_item_unlock(
+xfs_buf_item_release(
struct xfs_log_item *lip)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
@@ -605,9 +603,11 @@ xfs_buf_item_unlock(
#if defined(DEBUG) || defined(XFS_WARN)
bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
+ bool aborted = test_bit(XFS_LI_ABORTED,
+ &lip->li_flags);
#endif
- trace_xfs_buf_item_unlock(bip);
+ trace_xfs_buf_item_release(bip);
/*
* The bli dirty state should match whether the blf has logged segments
@@ -633,10 +633,18 @@ xfs_buf_item_unlock(
released = xfs_buf_item_put(bip);
if (hold || (stale && !released))
return;
- ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags));
+ ASSERT(!stale || aborted);
xfs_buf_relse(bp);
}
+STATIC void
+xfs_buf_item_committing(
+ struct xfs_log_item *lip,
+ xfs_lsn_t commit_lsn)
+{
+ return xfs_buf_item_release(lip);
+}
+
/*
* This is called to find out where the oldest active copy of the
* buf log item in the on disk log resides now that the last log
@@ -669,25 +677,15 @@ xfs_buf_item_committed(
return lsn;
}
-STATIC void
-xfs_buf_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all buf log items.
- */
static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_size = xfs_buf_item_size,
.iop_format = xfs_buf_item_format,
.iop_pin = xfs_buf_item_pin,
.iop_unpin = xfs_buf_item_unpin,
- .iop_unlock = xfs_buf_item_unlock,
+ .iop_release = xfs_buf_item_release,
+ .iop_committing = xfs_buf_item_committing,
.iop_committed = xfs_buf_item_committed,
.iop_push = xfs_buf_item_push,
- .iop_committing = xfs_buf_item_committing
};
STATIC int
@@ -741,7 +739,7 @@ xfs_buf_item_init(
* this buffer. If we do already have one, there is
* nothing to do here so return.
*/
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
if (bip) {
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
ASSERT(!bp->b_transp);
@@ -978,9 +976,9 @@ xfs_buf_item_relse(
*/
void
xfs_buf_attach_iodone(
- xfs_buf_t *bp,
- void (*cb)(xfs_buf_t *, xfs_log_item_t *),
- xfs_log_item_t *lip)
+ struct xfs_buf *bp,
+ void (*cb)(struct xfs_buf *, struct xfs_log_item *),
+ struct xfs_log_item *lip)
{
ASSERT(xfs_buf_islocked(bp));
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 90f65f891fab..4a054b11011a 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -39,7 +39,7 @@ struct xfs_buf_log_item;
* locked, and which 128 byte chunks of the buffer are dirty.
*/
struct xfs_buf_log_item {
- xfs_log_item_t bli_item; /* common item structure */
+ struct xfs_log_item bli_item; /* common item structure */
struct xfs_buf *bli_buf; /* real buffer pointer */
unsigned int bli_flags; /* misc flags */
unsigned int bli_recur; /* lock recursion count */
@@ -55,8 +55,8 @@ bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
- void(*)(struct xfs_buf *, xfs_log_item_t *),
- xfs_log_item_t *);
+ void(*)(struct xfs_buf *, struct xfs_log_item *),
+ struct xfs_log_item *);
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 5142e64e2345..283df898dd9f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -6,17 +6,14 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_trans.h"
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 93f07edafd81..8ec7aab89044 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -4,19 +4,17 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
-#include "xfs_discard.h"
#include "xfs_trace.h"
#include "xfs_log.h"
@@ -161,9 +159,19 @@ xfs_ioc_trim(
return -EPERM;
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
+
+ /*
+ * We haven't recovered the log, so we cannot use our bnobt-guided
+ * storage zapping commands.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return -EROFS;
+
if (copy_from_user(&range, urange, sizeof(range)))
return -EFAULT;
+ range.minlen = max_t(u64, granularity, range.minlen);
+ minlen = BTOBB(range.minlen);
/*
* Truncating down the len isn't actually quite correct, but using
* BBTOB would mean we trivially get overflows for values
@@ -178,7 +186,6 @@ xfs_ioc_trim(
start = BTOBB(range.start);
end = start + BTOBBT(range.len) - 1;
- minlen = BTOBB(max_t(u64, granularity, range.minlen));
if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 87e6dd5326d5..fb1ad4483081 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -14,16 +14,12 @@
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_alloc.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
@@ -277,7 +273,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
/*
* Ensure that the given in-core dquot has a buffer on disk backing it, and
- * return the buffer. This is called when the bmapi finds a hole.
+ * return the buffer locked and held. This is called when the bmapi finds a
+ * hole.
*/
STATIC int
xfs_dquot_disk_alloc(
@@ -355,13 +352,14 @@ xfs_dquot_disk_alloc(
* If everything succeeds, the caller of this function is returned a
* buffer that is locked and held to the transaction. The caller
* is responsible for unlocking any buffer passed back, either
- * manually or by committing the transaction.
+ * manually or by committing the transaction. On error, the buffer is
+ * released and not passed back.
*/
xfs_trans_bhold(tp, bp);
error = xfs_defer_finish(tpp);
- tp = *tpp;
if (error) {
- xfs_buf_relse(bp);
+ xfs_trans_bhold_release(*tpp, bp);
+ xfs_trans_brelse(*tpp, bp);
return error;
}
*bpp = bp;
@@ -521,7 +519,6 @@ xfs_qm_dqread_alloc(
struct xfs_buf **bpp)
{
struct xfs_trans *tp;
- struct xfs_buf *bp;
int error;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
@@ -529,7 +526,7 @@ xfs_qm_dqread_alloc(
if (error)
goto err;
- error = xfs_dquot_disk_alloc(&tp, dqp, &bp);
+ error = xfs_dquot_disk_alloc(&tp, dqp, bpp);
if (error)
goto err_cancel;
@@ -539,10 +536,10 @@ xfs_qm_dqread_alloc(
* Buffer was held to the transaction, so we have to unlock it
* manually here because we're not passing it back.
*/
- xfs_buf_relse(bp);
+ xfs_buf_relse(*bpp);
+ *bpp = NULL;
goto err;
}
- *bpp = bp;
return 0;
err_cancel:
@@ -1242,7 +1239,7 @@ xfs_qm_exit(void)
/*
* Iterate every dquot of a particular type. The caller must ensure that the
* particular quota type is active. iter_fn can return negative error codes,
- * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating.
+ * or XFS_ITER_ABORT to indicate that it wants to stop iterating.
*/
int
xfs_qm_dqiterate(
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 64bd8640f6e8..4fe85709d55d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -34,7 +34,6 @@ typedef struct xfs_dquot {
uint dq_flags; /* various flags (XFS_DQ_*) */
struct list_head q_lru; /* global free list of dquots */
struct xfs_mount*q_mount; /* filesystem this relates to */
- struct xfs_trans*q_transp; /* trans this belongs to currently */
uint q_nrefs; /* # active refs from inodes */
xfs_daddr_t q_blkno; /* blkno of dquot buffer */
int q_bufoffset; /* off of dq in buffer (# dquots) */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 7dedd17c4813..282ec5af293e 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -5,13 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
@@ -94,18 +94,6 @@ xfs_qm_dquot_logitem_unpin(
wake_up(&dqp->q_pinwait);
}
-STATIC xfs_lsn_t
-xfs_qm_dquot_logitem_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- /*
- * We always re-log the entire dquot when it becomes dirty,
- * so, the latest copy _is_ the only one that matters.
- */
- return lsn;
-}
-
/*
* This is called to wait for the given dquot to be unpinned.
* Most of these pin/unpin routines are plagiarized from inode code.
@@ -209,14 +197,8 @@ out_unlock:
return rval;
}
-/*
- * Unlock the dquot associated with the log item.
- * Clear the fields of the dquot and dquot log item that
- * are specific to the current transaction. If the
- * hold flags is set, do not unlock the dquot.
- */
STATIC void
-xfs_qm_dquot_logitem_unlock(
+xfs_qm_dquot_logitem_release(
struct xfs_log_item *lip)
{
struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
@@ -224,11 +206,6 @@ xfs_qm_dquot_logitem_unlock(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
/*
- * Clear the transaction pointer in the dquot
- */
- dqp->q_transp = NULL;
-
- /*
* dquots are never 'held' from getting unlocked at the end of
* a transaction. Their locking and unlocking is hidden inside the
* transaction layer, within trans_commit. Hence, no LI_HOLD flag
@@ -237,30 +214,22 @@ xfs_qm_dquot_logitem_unlock(
xfs_dqunlock(dqp);
}
-/*
- * this needs to stamp an lsn into the dquot, I think.
- * rpc's that look at user dquot's would then have to
- * push on the dependency recorded in the dquot
- */
STATIC void
xfs_qm_dquot_logitem_committing(
struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+ xfs_lsn_t commit_lsn)
{
+ return xfs_qm_dquot_logitem_release(lip);
}
-/*
- * This is the ops vector for dquots
- */
static const struct xfs_item_ops xfs_dquot_item_ops = {
.iop_size = xfs_qm_dquot_logitem_size,
.iop_format = xfs_qm_dquot_logitem_format,
.iop_pin = xfs_qm_dquot_logitem_pin,
.iop_unpin = xfs_qm_dquot_logitem_unpin,
- .iop_unlock = xfs_qm_dquot_logitem_unlock,
- .iop_committed = xfs_qm_dquot_logitem_committed,
+ .iop_release = xfs_qm_dquot_logitem_release,
+ .iop_committing = xfs_qm_dquot_logitem_committing,
.iop_push = xfs_qm_dquot_logitem_push,
- .iop_committing = xfs_qm_dquot_logitem_committing,
.iop_error = xfs_dquot_item_error
};
@@ -320,26 +289,6 @@ xfs_qm_qoff_logitem_format(
}
/*
- * Pinning has no meaning for an quotaoff item, so just return.
- */
-STATIC void
-xfs_qm_qoff_logitem_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an quotaoff item, unpinning does
- * not either.
- */
-STATIC void
-xfs_qm_qoff_logitem_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-/*
* There isn't much you can do to push a quotaoff item. It is simply
* stuck waiting for the log to be flushed to disk.
*/
@@ -351,28 +300,6 @@ xfs_qm_qoff_logitem_push(
return XFS_ITEM_LOCKED;
}
-/*
- * Quotaoff items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-STATIC void
-xfs_qm_qoff_logitem_unlock(
- struct xfs_log_item *lip)
-{
-}
-
-/*
- * The quotaoff-start-item is logged only once and cannot be moved in the log,
- * so simply return the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
STATIC xfs_lsn_t
xfs_qm_qoffend_logitem_committed(
struct xfs_log_item *lip,
@@ -396,50 +323,17 @@ xfs_qm_qoffend_logitem_committed(
return (xfs_lsn_t)-1;
}
-/*
- * XXX rcc - don't know quite what to do with this. I think we can
- * just ignore it. The only time that isn't the case is if we allow
- * the client to somehow see that quotas have been turned off in which
- * we can't allow that to get back until the quotaoff hits the disk.
- * So how would that happen? Also, do we need different routines for
- * quotaoff start and quotaoff end? I suspect the answer is yes but
- * to be sure, I need to look at the recovery code and see how quota off
- * recovery is handled (do we roll forward or back or do something else).
- * If we roll forwards or backwards, then we need two separate routines,
- * one that does nothing and one that stamps in the lsn that matters
- * (truly makes the quotaoff irrevocable). If we do something else,
- * then maybe we don't need two.
- */
-STATIC void
-xfs_qm_qoff_logitem_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
-{
-}
-
static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
- .iop_pin = xfs_qm_qoff_logitem_pin,
- .iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_unlock = xfs_qm_qoff_logitem_unlock,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
- .iop_committing = xfs_qm_qoff_logitem_committing
};
-/*
- * This is the ops vector shared by all quotaoff-start log items.
- */
static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
- .iop_pin = xfs_qm_qoff_logitem_pin,
- .iop_unpin = xfs_qm_qoff_logitem_unpin,
- .iop_unlock = xfs_qm_qoff_logitem_unlock,
- .iop_committed = xfs_qm_qoff_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
- .iop_committing = xfs_qm_qoff_logitem_committing
};
/*
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index db9df710a308..1aed34ccdabc 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -12,13 +12,13 @@ struct xfs_mount;
struct xfs_qoff_logitem;
typedef struct xfs_dq_logitem {
- xfs_log_item_t qli_item; /* common portion */
+ struct xfs_log_item qli_item; /* common portion */
struct xfs_dquot *qli_dquot; /* dquot ptr */
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
} xfs_dq_logitem_t;
typedef struct xfs_qoff_logitem {
- xfs_log_item_t qql_item; /* common portion */
+ struct xfs_log_item qql_item; /* common portion */
struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
unsigned int qql_flags;
} xfs_qoff_logitem_t;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 9866f542e77b..544c9482a0ef 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_fs.h"
#include "xfs_log_format.h"
@@ -51,6 +52,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_BUF_LRU_REF,
XFS_RANDOM_FORCE_SCRUB_REPAIR,
XFS_RANDOM_FORCE_SUMMARY_RECALC,
+ XFS_RANDOM_IUNLINK_FALLBACK,
};
struct xfs_errortag_attr {
@@ -159,6 +161,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
+XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -195,6 +198,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
XFS_ERRORTAG_ATTR_LIST(force_repair),
XFS_ERRORTAG_ATTR_LIST(bad_summary),
+ XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
NULL,
};
@@ -350,14 +354,15 @@ xfs_buf_verifier_error(
size_t bufsz,
xfs_failaddr_t failaddr)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
xfs_failaddr_t fa;
int sz;
fa = failaddr ? failaddr : __return_address;
__xfs_buf_ioerror(bp, error, fa);
- xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s",
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata %s detected at %pS, %s block 0x%llx %s",
bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
fa, bp->b_ops->name, bp->b_bn, name);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 246d3e989c6c..602aa7d62b66 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
+#define XFS_PTAG_VERIFIER_ERROR 0x00000100
#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f2284ceb129f..f1372f9046e3 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -4,18 +4,16 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_export.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_pnfs.h"
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 74ddf66f4cfe..86f6512d6864 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -9,14 +9,18 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
+#include "xfs_shared.h"
#include "xfs_mount.h"
+#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_extfree_item.h"
#include "xfs_log.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_trace.h"
kmem_zone_t *xfs_efi_zone;
@@ -107,15 +111,6 @@ xfs_efi_item_format(
/*
- * Pinning has no meaning for an efi item, so just return.
- */
-STATIC void
-xfs_efi_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an EFI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the EFI transaction has been successfully committed to make it
@@ -133,71 +128,22 @@ xfs_efi_item_unpin(
}
/*
- * Efi items have no locking or pushing. However, since EFIs are pulled from
- * the AIL when their corresponding EFDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the EFI out of
- * the AIL.
- */
-STATIC uint
-xfs_efi_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The EFI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an EFD isn't going to be
* constructed and thus we free the EFI here directly.
*/
STATIC void
-xfs_efi_item_unlock(
+xfs_efi_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_efi_release(EFI_ITEM(lip));
-}
-
-/*
- * The EFI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_efi_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The EFI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_efi_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
+ xfs_efi_release(EFI_ITEM(lip));
}
-/*
- * This is the ops vector shared by all efi log items.
- */
static const struct xfs_item_ops xfs_efi_item_ops = {
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
- .iop_pin = xfs_efi_item_pin,
.iop_unpin = xfs_efi_item_unpin,
- .iop_unlock = xfs_efi_item_unlock,
- .iop_committed = xfs_efi_item_committed,
- .iop_push = xfs_efi_item_push,
- .iop_committing = xfs_efi_item_committing
+ .iop_release = xfs_efi_item_release,
};
@@ -349,136 +295,298 @@ xfs_efd_item_format(
}
/*
- * Pinning has no meaning for an efd item, so just return.
+ * The EFD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the EFI and free the EFD.
*/
STATIC void
-xfs_efd_item_pin(
+xfs_efd_item_release(
struct xfs_log_item *lip)
{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ xfs_efi_release(efdp->efd_efip);
+ xfs_efd_item_free(efdp);
}
+static const struct xfs_item_ops xfs_efd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .iop_size = xfs_efd_item_size,
+ .iop_format = xfs_efd_item_format,
+ .iop_release = xfs_efd_item_release,
+};
+
/*
- * Since pinning has no meaning for an efd item, unpinning does
- * not either.
+ * Allocate an "extent free done" log item that will hold nextents worth of
+ * extents. The caller must use all nextents extents, because we are not
+ * flexible about this at all.
*/
-STATIC void
-xfs_efd_item_unpin(
- struct xfs_log_item *lip,
- int remove)
+static struct xfs_efd_log_item *
+xfs_trans_get_efd(
+ struct xfs_trans *tp,
+ struct xfs_efi_log_item *efip,
+ unsigned int nextents)
{
+ struct xfs_efd_log_item *efdp;
+
+ ASSERT(nextents > 0);
+
+ if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
+ efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) +
+ (nextents - 1) * sizeof(struct xfs_extent),
+ KM_SLEEP);
+ } else {
+ efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
+ }
+
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+ &xfs_efd_item_ops);
+ efdp->efd_efip = efip;
+ efdp->efd_format.efd_nextents = nextents;
+ efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+ xfs_trans_add_item(tp, &efdp->efd_item);
+ return efdp;
}
/*
- * There isn't much you can do to push on an efd item. It is simply stuck
- * waiting for the log to be flushed to disk.
+ * Free an extent and log it to the EFD. Note that the transaction is marked
+ * dirty regardless of whether the extent free succeeds or fails to support the
+ * EFI/EFD lifecycle rules.
*/
-STATIC uint
-xfs_efd_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
+static int
+xfs_trans_free_extent(
+ struct xfs_trans *tp,
+ struct xfs_efd_log_item *efdp,
+ xfs_fsblock_t start_block,
+ xfs_extlen_t ext_len,
+ const struct xfs_owner_info *oinfo,
+ bool skip_discard)
{
- return XFS_ITEM_PINNED;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
+ start_block);
+ int error;
+
+ trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
+
+ error = __xfs_free_extent(tp, start_block, ext_len,
+ oinfo, XFS_AG_RESV_NONE, skip_discard);
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = start_block;
+ extp->ext_len = ext_len;
+ efdp->efd_next_extent++;
+
+ return error;
}
-/*
- * The EFD is either committed or aborted if the transaction is cancelled. If
- * the transaction is cancelled, drop our reference to the EFI and free the EFD.
- */
-STATIC void
-xfs_efd_item_unlock(
- struct xfs_log_item *lip)
+/* Sort bmap items by AG. */
+static int
+xfs_extent_free_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
{
- struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ struct xfs_mount *mp = priv;
+ struct xfs_extent_free_item *ra;
+ struct xfs_extent_free_item *rb;
+
+ ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+ rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+ return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+}
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_efi_release(efdp->efd_efip);
- xfs_efd_item_free(efdp);
- }
+/* Get an EFI. */
+STATIC void *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_efi_log_item *efip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ ASSERT(efip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &efip->efi_item);
+ return efip;
}
-/*
- * When the efd item is committed to disk, all we need to do is delete our
- * reference to our partner efi item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from further
- * referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_efd_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+/* Log a free extent to the intent item. */
+STATIC void
+xfs_extent_free_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
{
- struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ struct xfs_efi_log_item *efip = intent;
+ struct xfs_extent_free_item *free;
+ uint next_extent;
+ struct xfs_extent *extp;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
/*
- * Drop the EFI reference regardless of whether the EFD has been
- * aborted. Once the EFD transaction is constructed, it is the sole
- * responsibility of the EFD to release the EFI (even if the EFI is
- * aborted due to log I/O error).
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
*/
- xfs_efi_release(efdp->efd_efip);
- xfs_efd_item_free(efdp);
+ next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+ ASSERT(next_extent < efip->efi_format.efi_nextents);
+ extp = &efip->efi_format.efi_extents[next_extent];
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+}
- return (xfs_lsn_t)-1;
+/* Get an EFD so we can process all the free extents. */
+STATIC void *
+xfs_extent_free_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_efd(tp, intent, count);
}
-/*
- * The EFD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
+/* Process a free extent. */
+STATIC int
+xfs_extent_free_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_extent_free_item *free;
+ int error;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ error = xfs_trans_free_extent(tp, done_item,
+ free->xefi_startblock,
+ free->xefi_blockcount,
+ &free->xefi_oinfo, free->xefi_skip_discard);
+ kmem_free(free);
+ return error;
+}
+
+/* Abort all pending EFIs. */
STATIC void
-xfs_efd_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+xfs_extent_free_abort_intent(
+ void *intent)
{
+ xfs_efi_release(intent);
}
-/*
- * This is the ops vector shared by all efd log items.
- */
-static const struct xfs_item_ops xfs_efd_item_ops = {
- .iop_size = xfs_efd_item_size,
- .iop_format = xfs_efd_item_format,
- .iop_pin = xfs_efd_item_pin,
- .iop_unpin = xfs_efd_item_unpin,
- .iop_unlock = xfs_efd_item_unlock,
- .iop_committed = xfs_efd_item_committed,
- .iop_push = xfs_efd_item_push,
- .iop_committing = xfs_efd_item_committing
+/* Cancel a free extent. */
+STATIC void
+xfs_extent_free_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_extent_free_item *free;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ kmem_free(free);
+}
+
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
};
/*
- * Allocate and initialize an efd item with the given number of extents.
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
*/
-struct xfs_efd_log_item *
-xfs_efd_init(
- struct xfs_mount *mp,
- struct xfs_efi_log_item *efip,
- uint nextents)
-
+STATIC int
+xfs_agfl_free_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
{
- struct xfs_efd_log_item *efdp;
- uint size;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efd_log_item *efdp = done_item;
+ struct xfs_extent_free_item *free;
+ struct xfs_extent *extp;
+ struct xfs_buf *agbp;
+ int error;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ uint next_extent;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(free->xefi_blockcount == 1);
+ agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+
+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ if (!error)
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp,
+ &free->xefi_oinfo);
- ASSERT(nextents > 0);
- if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(xfs_efd_log_item_t) +
- ((nextents - 1) * sizeof(xfs_extent_t)));
- efdp = kmem_zalloc(size, KM_SLEEP);
- } else {
- efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP);
- }
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
- xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops);
- efdp->efd_efip = efip;
- efdp->efd_format.efd_nextents = nextents;
- efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+ efdp->efd_next_extent++;
- return efdp;
+ kmem_free(free);
+ return error;
}
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
/*
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 2a6a895ca73e..16aaab06d4ec 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -51,7 +51,7 @@ struct kmem_zone;
* AIL, so at this point both the EFI and EFD are freed.
*/
typedef struct xfs_efi_log_item {
- xfs_log_item_t efi_item;
+ struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
unsigned long efi_flags; /* misc flags */
@@ -64,7 +64,7 @@ typedef struct xfs_efi_log_item {
* have been freed.
*/
typedef struct xfs_efd_log_item {
- xfs_log_item_t efd_item;
+ struct xfs_log_item efd_item;
xfs_efi_log_item_t *efd_efip;
uint efd_next_extent;
xfs_efd_log_format_t efd_format;
@@ -79,8 +79,6 @@ extern struct kmem_zone *xfs_efi_zone;
extern struct kmem_zone *xfs_efd_zone;
xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
-xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *,
- uint);
int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
void xfs_efi_item_free(xfs_efi_log_item_t *);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e47425071e65..28101bbc0b78 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -10,14 +10,11 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
@@ -28,9 +25,7 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
-#include <linux/dcache.h>
#include <linux/falloc.h>
-#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
@@ -367,20 +362,7 @@ restart:
* lock above. Eventually we should look into a way to avoid
* the pointless lock roundtrip.
*/
- if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
- error = file_update_time(file);
- if (error)
- return error;
- }
-
- /*
- * If we're writing the file then make sure to clear the setuid and
- * setgid bits if the process is not being run by root. This keeps
- * people from modifying setuid and setgid binaries.
- */
- if (!IS_NOSEC(inode))
- return file_remove_privs(file);
- return 0;
+ return file_modified(file);
}
static int
@@ -392,6 +374,7 @@ xfs_dio_write_end_io(
struct inode *inode = file_inode(iocb->ki_filp);
struct xfs_inode *ip = XFS_I(inode);
loff_t offset = iocb->ki_pos;
+ unsigned int nofs_flag;
int error = 0;
trace_xfs_end_io_direct_write(ip, offset, size);
@@ -408,10 +391,17 @@ xfs_dio_write_end_io(
*/
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
if (flags & IOMAP_DIO_COW) {
error = xfs_reflink_end_cow(ip, offset, size);
if (error)
- return error;
+ goto out;
}
/*
@@ -420,8 +410,10 @@ xfs_dio_write_end_io(
* earlier allows a racing dio read to find unwritten extents before
* they are converted.
*/
- if (flags & IOMAP_DIO_UNWRITTEN)
- return xfs_iomap_write_unwritten(ip, offset, size, true);
+ if (flags & IOMAP_DIO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, offset, size, true);
+ goto out;
+ }
/*
* We need to update the in-core inode size here so that we don't end up
@@ -443,6 +435,8 @@ xfs_dio_write_end_io(
spin_unlock(&ip->i_flags_lock);
}
+out:
+ memalloc_nofs_restore(nofs_flag);
return error;
}
@@ -507,7 +501,7 @@ xfs_file_dio_aio_write(
* We can't properly handle unaligned direct I/O to reflink
* files yet, as we can't unshare a partial block.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
return -EREMCHG;
}
@@ -517,6 +511,9 @@ xfs_file_dio_aio_write(
}
if (iocb->ki_flags & IOCB_NOWAIT) {
+ /* unaligned dio always waits, bail */
+ if (unaligned_io)
+ return -EAGAIN;
if (!xfs_ilock_nowait(ip, iolock))
return -EAGAIN;
} else {
@@ -529,18 +526,14 @@ xfs_file_dio_aio_write(
count = iov_iter_count(from);
/*
- * If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to take the exclusive lock
- * for other reasons in xfs_file_aio_write_checks.
+ * If we are doing unaligned IO, we can't allow any other overlapping IO
+ * in-flight at the same time or we risk data corruption. Wait for all
+ * other IO to drain before we submit. If the IO is aligned, demote the
+ * iolock if we had to take the exclusive lock in
+ * xfs_file_aio_write_checks() for other reasons.
*/
if (unaligned_io) {
- /* If we are going to wait for other DIO to finish, bail */
- if (iocb->ki_flags & IOCB_NOWAIT) {
- if (atomic_read(&inode->i_dio_count))
- return -EAGAIN;
- } else {
- inode_dio_wait(inode);
- }
+ inode_dio_wait(inode);
} else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
@@ -548,6 +541,14 @@ xfs_file_dio_aio_write(
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
+
+ /*
+ * If unaligned, this is the only IO in-flight. If it has not yet
+ * completed, wait on it before we release the iolock to prevent
+ * subsequent overlapping IO.
+ */
+ if (ret == -EIOCBQUEUED && unaligned_io)
+ inode_dio_wait(inode);
out:
xfs_iunlock(ip, iolock);
@@ -872,14 +873,27 @@ xfs_file_fallocate(
goto out_unlock;
}
- if (mode & FALLOC_FL_ZERO_RANGE)
+ if (mode & FALLOC_FL_ZERO_RANGE) {
error = xfs_zero_file_space(ip, offset, len);
- else {
- if (mode & FALLOC_FL_UNSHARE_RANGE) {
- error = xfs_reflink_unshare(ip, offset, len);
- if (error)
- goto out_unlock;
+ } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+
+ if (!xfs_is_always_cow_inode(ip)) {
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
}
+ } else {
+ /*
+ * In always_cow mode we can't use preallocations and
+ * thus should not create them.
+ */
+ if (xfs_is_always_cow_inode(ip)) {
+ error = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
}
@@ -1068,10 +1082,10 @@ xfs_file_llseek(
default:
return generic_file_llseek(file, offset, whence);
case SEEK_HOLE:
- offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
break;
case SEEK_DATA:
- offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
+ offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
break;
}
@@ -1183,11 +1197,14 @@ xfs_file_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
+ struct dax_device *dax_dev;
+
+ dax_dev = xfs_find_daxdev_for_inode(file_inode(filp));
/*
- * We don't support synchronous mappings for non-DAX files. At least
- * until someone comes with a sensible use case.
+ * We don't support synchronous mappings for non-DAX files, or
+ * for DAX files if the underlying dax_device is not synchronous.
*/
- if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+ if (!daxdev_mapping_supported(vma, dax_dev))
return -EOPNOTSUPP;
file_accessed(filp);
@@ -1203,6 +1220,7 @@ const struct file_operations xfs_file_operations = {
.write_iter = xfs_file_write_iter,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
+ .iopoll = iomap_dio_iopoll,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 182501373af2..574a7a8b4736 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -5,22 +5,19 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
-#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_ag_resv.h"
#include "xfs_trans.h"
-#include "xfs_shared.h"
struct xfs_fstrm_item {
struct xfs_mru_cache_elem mru;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 3d76a9e35870..5a8f9641562a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -9,16 +9,12 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_error.h"
#include "xfs_btree.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
-#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f3ef70c542e1..3e61d0cc23f8 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -11,15 +11,11 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_error.h"
-#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
-#include "xfs_rtalloc.h"
-#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
@@ -251,9 +247,9 @@ xfs_growfs_data(
if (mp->m_sb.sb_imax_pct) {
uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
do_div(icount, 100);
- mp->m_maxicount = XFS_FSB_TO_INO(mp, icount);
+ M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
} else
- mp->m_maxicount = 0;
+ M_IGEO(mp)->maxicount = 0;
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
@@ -289,7 +285,7 @@ xfs_growfs_log(
* exported through ioctl XFS_IOC_FSCOUNTS
*/
-int
+void
xfs_fs_counts(
xfs_mount_t *mp,
xfs_fsop_counts_t *cnt)
@@ -302,7 +298,6 @@ xfs_fs_counts(
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
spin_unlock(&mp->m_sb_lock);
- return 0;
}
/*
@@ -533,6 +528,7 @@ xfs_fs_reserve_ag_blocks(
int error = 0;
int err2;
+ mp->m_finobt_nores = false;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
pag = xfs_perag_get(mp, agno);
err2 = xfs_ag_resv_init(pag, NULL);
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index d023db0862c2..92869f6ec8d3 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -8,7 +8,7 @@
extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in);
extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in);
-extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
+extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5169e84ae382..fa55ab8b8d80 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include "xfs_sysctl.h"
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
@@ -16,7 +15,7 @@ xfs_param_t xfs_params = {
/* MIN DFLT MAX */
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
- .panic_mask = { 0, 0, 255 },
+ .panic_mask = { 0, 0, 256 },
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
.stats_clear = { 0, 0, 1 },
@@ -41,4 +40,7 @@ struct xfs_globals xfs_globals = {
#else
.bug_on_assert = false, /* assert failures WARN() */
#endif
+#ifdef DEBUG
+ .pwork_threads = -1, /* automatic thread detection */
+#endif
};
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
new file mode 100644
index 000000000000..8e0cb05a7142
--- /dev/null
+++ b/fs/xfs/xfs_health.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+#include "xfs_health.h"
+
+/*
+ * Warn about metadata corruption that we detected but haven't fixed, and
+ * make sure we're not sitting on anything that would get in the way of
+ * recovery.
+ */
+void
+xfs_health_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ unsigned int sick = 0;
+ unsigned int checked = 0;
+ bool warn = false;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return;
+
+ /* Measure AG corruption levels. */
+ for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+ pag = xfs_perag_get(mp, agno);
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ if (sick) {
+ trace_xfs_ag_unfixed_corruption(mp, agno, sick);
+ warn = true;
+ }
+ xfs_perag_put(pag);
+ }
+
+ /* Measure realtime volume corruption levels. */
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ if (sick) {
+ trace_xfs_rt_unfixed_corruption(mp, sick);
+ warn = true;
+ }
+
+ /*
+ * Measure fs corruption and keep the sample around for the warning.
+ * See the note below for why we exempt FS_COUNTERS.
+ */
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ if (sick & ~XFS_SICK_FS_COUNTERS) {
+ trace_xfs_fs_unfixed_corruption(mp, sick);
+ warn = true;
+ }
+
+ if (warn) {
+ xfs_warn(mp,
+"Uncorrected metadata errors detected; please run xfs_repair.");
+
+ /*
+ * We discovered uncorrected metadata problems at some point
+ * during this filesystem mount and have advised the
+ * administrator to run repair once the unmount completes.
+ *
+ * However, we must be careful -- when FSCOUNTERS are flagged
+ * unhealthy, the unmount procedure omits writing the clean
+ * unmount record to the log so that the next mount will run
+ * recovery and recompute the summary counters. In other
+ * words, we leave a dirty log to get the counters fixed.
+ *
+ * Unfortunately, xfs_repair cannot recover dirty logs, so if
+ * there were filesystem problems, FSCOUNTERS was flagged, and
+ * the administrator takes our advice to run xfs_repair,
+ * they'll have to zap the log before repairing structures.
+ * We don't really want to encourage this, so we mark the
+ * FSCOUNTERS healthy so that a subsequent repair run won't see
+ * a dirty log.
+ */
+ if (sick & XFS_SICK_FS_COUNTERS)
+ xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS);
+ }
+}
+
+/* Mark unhealthy per-fs metadata. */
+void
+xfs_fs_mark_sick(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ trace_xfs_fs_mark_sick(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick |= mask;
+ mp->m_fs_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark per-fs metadata as healthy. */
+void
+xfs_fs_mark_healthy(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+ trace_xfs_fs_mark_healthy(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_fs_sick &= ~mask;
+ mp->m_fs_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which per-fs metadata are unhealthy. */
+void
+xfs_fs_measure_sickness(
+ struct xfs_mount *mp,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&mp->m_sb_lock);
+ *sick = mp->m_fs_sick;
+ *checked = mp->m_fs_checked;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy realtime metadata. */
+void
+xfs_rt_mark_sick(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ trace_xfs_rt_mark_sick(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick |= mask;
+ mp->m_rt_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark realtime metadata as healthy. */
+void
+xfs_rt_mark_healthy(
+ struct xfs_mount *mp,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+ trace_xfs_rt_mark_healthy(mp, mask);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_rt_sick &= ~mask;
+ mp->m_rt_checked |= mask;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Sample which realtime metadata are unhealthy. */
+void
+xfs_rt_measure_sickness(
+ struct xfs_mount *mp,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&mp->m_sb_lock);
+ *sick = mp->m_rt_sick;
+ *checked = mp->m_rt_checked;
+ spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark unhealthy per-ag metadata. */
+void
+xfs_ag_mark_sick(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick |= mask;
+ pag->pag_checked |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata as healthy. */
+void
+xfs_ag_mark_healthy(
+ struct xfs_perag *pag,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+ trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
+
+ spin_lock(&pag->pag_state_lock);
+ pag->pag_sick &= ~mask;
+ pag->pag_checked |= mask;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Sample which per-ag metadata are unhealthy. */
+void
+xfs_ag_measure_sickness(
+ struct xfs_perag *pag,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&pag->pag_state_lock);
+ *sick = pag->pag_sick;
+ *checked = pag->pag_checked;
+ spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark the unhealthy parts of an inode. */
+void
+xfs_inode_mark_sick(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ trace_xfs_inode_mark_sick(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick |= mask;
+ ip->i_checked |= mask;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mark parts of an inode as healthy. */
+void
+xfs_inode_mark_healthy(
+ struct xfs_inode *ip,
+ unsigned int mask)
+{
+ ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ trace_xfs_inode_mark_healthy(ip, mask);
+
+ spin_lock(&ip->i_flags_lock);
+ ip->i_sick &= ~mask;
+ ip->i_checked |= mask;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Sample which parts of an inode are unhealthy. */
+void
+xfs_inode_measure_sickness(
+ struct xfs_inode *ip,
+ unsigned int *sick,
+ unsigned int *checked)
+{
+ spin_lock(&ip->i_flags_lock);
+ *sick = ip->i_sick;
+ *checked = ip->i_checked;
+ spin_unlock(&ip->i_flags_lock);
+}
+
+/* Mappings between internal sick masks and ioctl sick masks. */
+
+struct ioctl_sick_map {
+ unsigned int sick_mask;
+ unsigned int ioctl_mask;
+};
+
+static const struct ioctl_sick_map fs_map[] = {
+ { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS},
+ { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA },
+ { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA },
+ { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA },
+ { 0, 0 },
+};
+
+static const struct ioctl_sick_map rt_map[] = {
+ { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP },
+ { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY },
+ { 0, 0 },
+};
+
+static inline void
+xfgeo_health_tick(
+ struct xfs_fsop_geom *geo,
+ unsigned int sick,
+ unsigned int checked,
+ const struct ioctl_sick_map *m)
+{
+ if (checked & m->sick_mask)
+ geo->checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ geo->sick |= m->ioctl_mask;
+}
+
+/* Fill out fs geometry health info. */
+void
+xfs_fsop_geom_health(
+ struct xfs_mount *mp,
+ struct xfs_fsop_geom *geo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ geo->sick = 0;
+ geo->checked = 0;
+
+ xfs_fs_measure_sickness(mp, &sick, &checked);
+ for (m = fs_map; m->sick_mask; m++)
+ xfgeo_health_tick(geo, sick, checked, m);
+
+ xfs_rt_measure_sickness(mp, &sick, &checked);
+ for (m = rt_map; m->sick_mask; m++)
+ xfgeo_health_tick(geo, sick, checked, m);
+}
+
+static const struct ioctl_sick_map ag_map[] = {
+ { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB },
+ { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF },
+ { XFS_SICK_AG_AGFL, XFS_AG_GEOM_SICK_AGFL },
+ { XFS_SICK_AG_AGI, XFS_AG_GEOM_SICK_AGI },
+ { XFS_SICK_AG_BNOBT, XFS_AG_GEOM_SICK_BNOBT },
+ { XFS_SICK_AG_CNTBT, XFS_AG_GEOM_SICK_CNTBT },
+ { XFS_SICK_AG_INOBT, XFS_AG_GEOM_SICK_INOBT },
+ { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT },
+ { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT },
+ { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT },
+ { 0, 0 },
+};
+
+/* Fill out ag geometry health info. */
+void
+xfs_ag_geom_health(
+ struct xfs_perag *pag,
+ struct xfs_ag_geometry *ageo)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ ageo->ag_sick = 0;
+ ageo->ag_checked = 0;
+
+ xfs_ag_measure_sickness(pag, &sick, &checked);
+ for (m = ag_map; m->sick_mask; m++) {
+ if (checked & m->sick_mask)
+ ageo->ag_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ ageo->ag_sick |= m->ioctl_mask;
+ }
+}
+
+static const struct ioctl_sick_map ino_map[] = {
+ { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE },
+ { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD },
+ { XFS_SICK_INO_BMBTA, XFS_BS_SICK_BMBTA },
+ { XFS_SICK_INO_BMBTC, XFS_BS_SICK_BMBTC },
+ { XFS_SICK_INO_DIR, XFS_BS_SICK_DIR },
+ { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR },
+ { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK },
+ { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT },
+ { 0, 0 },
+};
+
+/* Fill out bulkstat health info. */
+void
+xfs_bulkstat_health(
+ struct xfs_inode *ip,
+ struct xfs_bulkstat *bs)
+{
+ const struct ioctl_sick_map *m;
+ unsigned int sick;
+ unsigned int checked;
+
+ bs->bs_sick = 0;
+ bs->bs_checked = 0;
+
+ xfs_inode_measure_sickness(ip, &sick, &checked);
+ for (m = ino_map; m->sick_mask; m++) {
+ if (checked & m->sick_mask)
+ bs->bs_checked |= m->ioctl_mask;
+ if (sick & m->sick_mask)
+ bs->bs_sick |= m->ioctl_mask;
+ }
+}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 245483cc282b..0b0fd10a36d4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -5,13 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
@@ -23,8 +23,6 @@
#include "xfs_dquot.h"
#include "xfs_reflink.h"
-#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/iversion.h>
/*
@@ -70,6 +68,11 @@ xfs_inode_alloc(
ip->i_flags = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(ip->i_d));
+ ip->i_sick = 0;
+ ip->i_checked = 0;
+ INIT_WORK(&ip->i_ioend_work, xfs_end_io);
+ INIT_LIST_HEAD(&ip->i_ioend_list);
+ spin_lock_init(&ip->i_ioend_lock);
return ip;
}
@@ -446,6 +449,8 @@ xfs_iget_cache_hit(
ip->i_flags |= XFS_INEW;
xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
inode->i_state = I_NEW;
+ ip->i_sick = 0;
+ ip->i_checked = 0;
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
init_rwsem(&inode->i_rwsem);
@@ -1815,7 +1820,7 @@ xfs_inode_clear_cowblocks_tag(
/* Disable post-EOF and CoW block auto-reclamation. */
void
-xfs_icache_disable_reclaim(
+xfs_stop_block_reaping(
struct xfs_mount *mp)
{
cancel_delayed_work_sync(&mp->m_eofblocks_work);
@@ -1824,7 +1829,7 @@ xfs_icache_disable_reclaim(
/* Enable post-EOF and CoW block auto-reclamation. */
void
-xfs_icache_enable_reclaim(
+xfs_start_block_reaping(
struct xfs_mount *mp)
{
xfs_queue_eofblocks(mp);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 26c0626f1f75..48f1fd2bb6ad 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -119,7 +119,7 @@ xfs_fs_eofblocks_from_user(
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
-void xfs_icache_disable_reclaim(struct xfs_mount *mp);
-void xfs_icache_enable_reclaim(struct xfs_mount *mp);
+void xfs_stop_block_reaping(struct xfs_mount *mp);
+void xfs_start_block_reaping(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 8381d34cb102..d99a0a3e5f40 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -6,14 +6,9 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
-#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_error.h"
#include "xfs_icreate_item.h"
#include "xfs_log.h"
@@ -56,80 +51,18 @@ xfs_icreate_item_format(
sizeof(struct xfs_icreate_log));
}
-
-/* Pinning has no meaning for the create item, so just return. */
STATIC void
-xfs_icreate_item_pin(
+xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip));
}
-
-/* pinning has no meaning for the create item, so just return. */
-STATIC void
-xfs_icreate_item_unpin(
- struct xfs_log_item *lip,
- int remove)
-{
-}
-
-STATIC void
-xfs_icreate_item_unlock(
- struct xfs_log_item *lip)
-{
- struct xfs_icreate_item *icp = ICR_ITEM(lip);
-
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- kmem_zone_free(xfs_icreate_zone, icp);
- return;
-}
-
-/*
- * Because we have ordered buffers being tracked in the AIL for the inode
- * creation, we don't need the create item after this. Hence we can free
- * the log item and return -1 to tell the caller we're done with the item.
- */
-STATIC xfs_lsn_t
-xfs_icreate_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- struct xfs_icreate_item *icp = ICR_ITEM(lip);
-
- kmem_zone_free(xfs_icreate_zone, icp);
- return (xfs_lsn_t)-1;
-}
-
-/* item can never get into the AIL */
-STATIC uint
-xfs_icreate_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- ASSERT(0);
- return XFS_ITEM_SUCCESS;
-}
-
-/* Ordered buffers do the dependency tracking here, so this does nothing. */
-STATIC void
-xfs_icreate_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all buf log items.
- */
static const struct xfs_item_ops xfs_icreate_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
.iop_size = xfs_icreate_item_size,
.iop_format = xfs_icreate_item_format,
- .iop_pin = xfs_icreate_item_pin,
- .iop_unpin = xfs_icreate_item_unpin,
- .iop_push = xfs_icreate_item_push,
- .iop_unlock = xfs_icreate_item_unlock,
- .iop_committed = xfs_icreate_item_committed,
- .iop_committing = xfs_icreate_item_committing,
+ .iop_release = xfs_icreate_item_release,
};
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ae667ba74a1c..6467d5e1df2d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3,7 +3,6 @@
* Copyright (c) 2000-2006 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/log2.h>
#include <linux/iversion.h>
#include "xfs.h"
@@ -16,10 +15,7 @@
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
-#include "xfs_attr_sf.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
@@ -32,7 +28,6 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
@@ -40,7 +35,6 @@
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
-#include "xfs_dir2_priv.h"
kmem_zone_t *xfs_inode_zone;
@@ -441,12 +435,12 @@ xfs_lock_inumorder(int lock_mode, int subclass)
*/
static void
xfs_lock_inodes(
- xfs_inode_t **ips,
- int inodes,
- uint lock_mode)
+ struct xfs_inode **ips,
+ int inodes,
+ uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
- xfs_log_item_t *lp;
+ int attempts = 0, i, j, try_lock;
+ struct xfs_log_item *lp;
/*
* Currently supports between 2 and 5 inodes with exclusive locking. We
@@ -485,7 +479,7 @@ again:
*/
if (!try_lock) {
for (j = (i - 1); j >= 0 && !try_lock; j--) {
- lp = (xfs_log_item_t *)ips[j]->i_itemp;
+ lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
try_lock++;
}
@@ -551,7 +545,7 @@ xfs_lock_two_inodes(
struct xfs_inode *temp;
uint mode_temp;
int attempts = 0;
- xfs_log_item_t *lp;
+ struct xfs_log_item *lp;
ASSERT(hweight32(ip0_mode) == 1);
ASSERT(hweight32(ip1_mode) == 1);
@@ -585,7 +579,7 @@ xfs_lock_two_inodes(
* the second lock. If we can't get it, we must release the first one
* and try again.
*/
- lp = (xfs_log_item_t *)ip0->i_itemp;
+ lp = &ip0->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
xfs_iunlock(ip0, ip0_mode);
@@ -1116,7 +1110,7 @@ xfs_droplink(
/*
* Increment the link count on an inode & log the change.
*/
-static int
+static void
xfs_bumplink(
xfs_trans_t *tp,
xfs_inode_t *ip)
@@ -1126,7 +1120,6 @@ xfs_bumplink(
ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- return 0;
}
int
@@ -1235,9 +1228,7 @@ xfs_create(
if (error)
goto out_trans_cancel;
- error = xfs_bumplink(tp, dp);
- if (error)
- goto out_trans_cancel;
+ xfs_bumplink(tp, dp);
}
/*
@@ -1332,7 +1323,7 @@ xfs_create_tmpfile(
if (error)
goto out_trans_cancel;
- error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip);
+ error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip);
if (error)
goto out_trans_cancel;
@@ -1454,9 +1445,7 @@ xfs_link(
xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
- error = xfs_bumplink(tp, sip);
- if (error)
- goto error_return;
+ xfs_bumplink(tp, sip);
/*
* If this is a synchronous mount, make sure that the
@@ -1754,7 +1743,7 @@ xfs_inactive_ifree(
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
- if (unlikely(mp->m_inotbt_nores)) {
+ if (unlikely(mp->m_finobt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
@@ -1907,86 +1896,510 @@ xfs_inactive(
}
/*
- * This is called when the inode's link count goes to 0 or we are creating a
- * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be
- * set to true as the link count is dropped to zero by the VFS after we've
- * created the file successfully, so we have to add it to the unlinked list
- * while the link count is non-zero.
+ * In-Core Unlinked List Lookups
+ * =============================
+ *
+ * Every inode is supposed to be reachable from some other piece of metadata
+ * with the exception of the root directory. Inodes with a connection to a
+ * file descriptor but not linked from anywhere in the on-disk directory tree
+ * are collectively known as unlinked inodes, though the filesystem itself
+ * maintains links to these inodes so that on-disk metadata are consistent.
+ *
+ * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI
+ * header contains a number of buckets that point to an inode, and each inode
+ * record has a pointer to the next inode in the hash chain. This
+ * singly-linked list causes scaling problems in the iunlink remove function
+ * because we must walk that list to find the inode that points to the inode
+ * being removed from the unlinked hash bucket list.
+ *
+ * What if we modelled the unlinked list as a collection of records capturing
+ * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd
+ * have a fast way to look up unlinked list predecessors, which avoids the
+ * slow list walk. That's exactly what we do here (in-core) with a per-AG
+ * rhashtable.
+ *
+ * Because this is a backref cache, we ignore operational failures since the
+ * iunlink code can fall back to the slow bucket walk. The only errors that
+ * should bubble out are for obviously incorrect situations.
+ *
+ * All users of the backref cache MUST hold the AGI buffer lock to serialize
+ * access or have otherwise provided for concurrency control.
+ */
+
+/* Capture a "X.next_unlinked = Y" relationship. */
+struct xfs_iunlink {
+ struct rhash_head iu_rhash_head;
+ xfs_agino_t iu_agino; /* X */
+ xfs_agino_t iu_next_unlinked; /* Y */
+};
+
+/* Unlinked list predecessor lookup hashtable construction */
+static int
+xfs_iunlink_obj_cmpfn(
+ struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const xfs_agino_t *key = arg->key;
+ const struct xfs_iunlink *iu = obj;
+
+ if (iu->iu_next_unlinked != *key)
+ return 1;
+ return 0;
+}
+
+static const struct rhashtable_params xfs_iunlink_hash_params = {
+ .min_size = XFS_AGI_UNLINKED_BUCKETS,
+ .key_len = sizeof(xfs_agino_t),
+ .key_offset = offsetof(struct xfs_iunlink,
+ iu_next_unlinked),
+ .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = xfs_iunlink_obj_cmpfn,
+};
+
+/*
+ * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such
+ * relation is found.
+ */
+static xfs_agino_t
+xfs_iunlink_lookup_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino)
+{
+ struct xfs_iunlink *iu;
+
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ return iu ? iu->iu_agino : NULLAGINO;
+}
+
+/*
+ * Take ownership of an iunlink cache entry and insert it into the hash table.
+ * If successful, the entry will be owned by the cache; if not, it is freed.
+ * Either way, the caller does not own @iu after this call.
+ */
+static int
+xfs_iunlink_insert_backref(
+ struct xfs_perag *pag,
+ struct xfs_iunlink *iu)
+{
+ int error;
+
+ error = rhashtable_insert_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ /*
+ * Fail loudly if there already was an entry because that's a sign of
+ * corruption of in-memory data. Also fail loudly if we see an error
+ * code we didn't anticipate from the rhashtable code. Currently we
+ * only anticipate ENOMEM.
+ */
+ if (error) {
+ WARN(error != -ENOMEM, "iunlink cache insert error %d", error);
+ kmem_free(iu);
+ }
+ /*
+ * Absorb any runtime errors that aren't a result of corruption because
+ * this is a cache and we can always fall back to bucket list scanning.
+ */
+ if (error != 0 && error != -EEXIST)
+ error = 0;
+ return error;
+}
+
+/* Remember that @prev_agino.next_unlinked = @this_agino. */
+static int
+xfs_iunlink_add_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t prev_agino,
+ xfs_agino_t this_agino)
+{
+ struct xfs_iunlink *iu;
+
+ if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK))
+ return 0;
+
+ iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS);
+ iu->iu_agino = prev_agino;
+ iu->iu_next_unlinked = this_agino;
+
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/*
+ * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked.
+ * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there
+ * wasn't any such entry then we don't bother.
+ */
+static int
+xfs_iunlink_change_backref(
+ struct xfs_perag *pag,
+ xfs_agino_t agino,
+ xfs_agino_t next_unlinked)
+{
+ struct xfs_iunlink *iu;
+ int error;
+
+ /* Look up the old entry; if there wasn't one then exit. */
+ iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino,
+ xfs_iunlink_hash_params);
+ if (!iu)
+ return 0;
+
+ /*
+ * Remove the entry. This shouldn't ever return an error, but if we
+ * couldn't remove the old entry we don't want to add it again to the
+ * hash table, and if the entry disappeared on us then someone's
+ * violated the locking rules and we need to fail loudly. Either way
+ * we cannot remove the inode because internal state is or would have
+ * been corrupt.
+ */
+ error = rhashtable_remove_fast(&pag->pagi_unlinked_hash,
+ &iu->iu_rhash_head, xfs_iunlink_hash_params);
+ if (error)
+ return error;
+
+ /* If there is no new next entry just free our item and return. */
+ if (next_unlinked == NULLAGINO) {
+ kmem_free(iu);
+ return 0;
+ }
+
+ /* Update the entry and re-add it to the hash table. */
+ iu->iu_next_unlinked = next_unlinked;
+ return xfs_iunlink_insert_backref(pag, iu);
+}
+
+/* Set up the in-core predecessor structures. */
+int
+xfs_iunlink_init(
+ struct xfs_perag *pag)
+{
+ return rhashtable_init(&pag->pagi_unlinked_hash,
+ &xfs_iunlink_hash_params);
+}
+
+/* Free the in-core predecessor structures. */
+static void
+xfs_iunlink_free_item(
+ void *ptr,
+ void *arg)
+{
+ struct xfs_iunlink *iu = ptr;
+ bool *freed_anything = arg;
+
+ *freed_anything = true;
+ kmem_free(iu);
+}
+
+void
+xfs_iunlink_destroy(
+ struct xfs_perag *pag)
+{
+ bool freed_anything = false;
+
+ rhashtable_free_and_destroy(&pag->pagi_unlinked_hash,
+ xfs_iunlink_free_item, &freed_anything);
+
+ ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount));
+}
+
+/*
+ * Point the AGI unlinked bucket at an inode and log the results. The caller
+ * is responsible for validating the old value.
+ */
+STATIC int
+xfs_iunlink_update_bucket(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf *agibp,
+ unsigned int bucket_index,
+ xfs_agino_t new_agino)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+ xfs_agino_t old_value;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino));
+
+ old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index,
+ old_value, new_agino);
+
+ /*
+ * We should never find the head of the list already set to the value
+ * passed in because either we're adding or removing ourselves from the
+ * head of the list.
+ */
+ if (old_value == new_agino)
+ return -EFSCORRUPTED;
+
+ agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
+ offset = offsetof(struct xfs_agi, agi_unlinked) +
+ (sizeof(xfs_agino_t) * bucket_index);
+ xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ return 0;
+}
+
+/* Set an on-disk inode's next_unlinked pointer. */
+STATIC void
+xfs_iunlink_update_dinode(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_buf *ibp,
+ struct xfs_dinode *dip,
+ struct xfs_imap *imap,
+ xfs_agino_t next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int offset;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ trace_xfs_iunlink_update_dinode(mp, agno, agino,
+ be32_to_cpu(dip->di_next_unlinked), next_agino);
+
+ dip->di_next_unlinked = cpu_to_be32(next_agino);
+ offset = imap->im_boffset +
+ offsetof(struct xfs_dinode, di_next_unlinked);
+
+ /* need to recalc the inode CRC if appropriate */
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_inode_buf(tp, ibp);
+ xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
+ xfs_inobp_check(mp, ibp);
+}
+
+/* Set an in-core inode's unlinked pointer and return the old value. */
+STATIC int
+xfs_iunlink_update_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ xfs_agnumber_t agno,
+ xfs_agino_t next_agino,
+ xfs_agino_t *old_next_agino)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_dinode *dip;
+ struct xfs_buf *ibp;
+ xfs_agino_t old_value;
+ int error;
+
+ ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino));
+
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0);
+ if (error)
+ return error;
+
+ /* Make sure the old pointer isn't garbage. */
+ old_value = be32_to_cpu(dip->di_next_unlinked);
+ if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /*
+ * Since we're updating a linked list, we should never find that the
+ * current pointer is the same as the new value, unless we're
+ * terminating the list.
+ */
+ *old_next_agino = old_value;
+ if (old_value == next_agino) {
+ if (next_agino != NULLAGINO)
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ /* Ok, update the new pointer. */
+ xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino),
+ ibp, dip, &ip->i_imap, next_agino);
+ return 0;
+out:
+ xfs_trans_brelse(tp, ibp);
+ return error;
+}
+
+/*
+ * This is called when the inode's link count has gone to 0 or we are creating
+ * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0.
*
* We place the on-disk inode on a list in the AGI. It will be pulled from this
* list when the inode is freed.
*/
STATIC int
xfs_iunlink(
- struct xfs_trans *tp,
- struct xfs_inode *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp = tp->t_mountp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agino_t agino;
- short bucket_index;
- int offset;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ xfs_agino_t next_agino;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
+ ASSERT(VFS_I(ip)->i_nlink == 0);
ASSERT(VFS_I(ip)->i_mode != 0);
+ trace_xfs_iunlink(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
- error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
+ error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode will
+ * go on. Make sure the pointer isn't garbage and that this inode
+ * isn't already on the list.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- ASSERT(agino != 0);
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- ASSERT(agi->agi_unlinked[bucket_index]);
- ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (next_agino == agino ||
+ !xfs_verify_agino_or_null(mp, agno, next_agino))
+ return -EFSCORRUPTED;
+
+ if (next_agino != NULLAGINO) {
+ struct xfs_perag *pag;
+ xfs_agino_t old_agino;
- if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
/*
- * There is already another inode in the bucket we need
- * to add ourselves to. Add us at the front of the list.
- * Here we put the head pointer into our next pointer,
- * and then we fall through to point the head at us.
+ * There is already another inode in the bucket, so point this
+ * inode to the current head of the list.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
+ error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
+ &old_agino);
if (error)
return error;
+ ASSERT(old_agino == NULLAGINO);
- ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
- dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
+ /*
+ * agino has been unlinked, add a backref from the next inode
+ * back to agino.
+ */
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_add_backref(pag, agino, next_agino);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+ }
+
+ /* Point the head of the list at this inode. */
+ return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
+}
+
+/* Return the imap, dinode pointer, and buffer for an inode. */
+STATIC int
+xfs_iunlink_map_ino(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ int error;
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
+ imap->im_blkno = 0;
+ error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap returned error %d.",
+ __func__, error);
+ return error;
+ }
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
+ error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Walk the unlinked chain from @head_agino until we find the inode that
+ * points to @target_agino. Return the inode number, map, dinode pointer,
+ * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp.
+ *
+ * @tp, @agno, @head_agino, @target_agino, and @pag are input parameters.
+ * @agino, @imap, @dipp, and @bpp are all output parameters.
+ *
+ * Do not call this function if @target_agino is the head of the list.
+ */
+STATIC int
+xfs_iunlink_map_prev(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agino_t head_agino,
+ xfs_agino_t target_agino,
+ xfs_agino_t *agino,
+ struct xfs_imap *imap,
+ struct xfs_dinode **dipp,
+ struct xfs_buf **bpp,
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ xfs_agino_t next_agino;
+ int error;
+
+ ASSERT(head_agino != target_agino);
+ *bpp = NULL;
+
+ /* See if our backref cache can find it faster. */
+ *agino = xfs_iunlink_lookup_backref(pag, target_agino);
+ if (*agino != NULLAGINO) {
+ error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp);
+ if (error)
+ return error;
+
+ if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino)
+ return 0;
+
+ /*
+ * If we get here the cache contents were corrupt, so drop the
+ * buffer and fall back to walking the bucket list.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ WARN_ON_ONCE(1);
+ }
+
+ trace_xfs_iunlink_map_prev_fallback(mp, agno);
+
+ /* Otherwise, walk the entire bucket until we find it. */
+ next_agino = head_agino;
+ while (next_agino != target_agino) {
+ xfs_agino_t unlinked_agino;
+
+ if (*bpp)
+ xfs_trans_brelse(tp, *bpp);
+
+ *agino = next_agino;
+ error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp,
+ bpp);
+ if (error)
+ return error;
+
+ unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked);
+ /*
+ * Make sure this pointer is valid and isn't an obvious
+ * infinite loop.
+ */
+ if (!xfs_verify_agino(mp, agno, unlinked_agino) ||
+ next_agino == unlinked_agino) {
+ XFS_CORRUPTION_ERROR(__func__,
+ XFS_ERRLEVEL_LOW, mp,
+ *dipp, sizeof(**dipp));
+ error = -EFSCORRUPTED;
+ return error;
+ }
+ next_agino = unlinked_agino;
}
- /*
- * Point the bucket head pointer at the inode being inserted.
- */
- ASSERT(agino != 0);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
return 0;
}
@@ -1995,181 +2408,106 @@ xfs_iunlink(
*/
STATIC int
xfs_iunlink_remove(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
{
- xfs_ino_t next_ino;
- xfs_mount_t *mp;
- xfs_agi_t *agi;
- xfs_dinode_t *dip;
- xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_agino_t agino;
- xfs_agino_t next_agino;
- xfs_buf_t *last_ibp;
- xfs_dinode_t *last_dip = NULL;
- short bucket_index;
- int offset, last_offset = 0;
- int error;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi;
+ struct xfs_buf *agibp;
+ struct xfs_buf *last_ibp;
+ struct xfs_dinode *last_dip = NULL;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t next_agino;
+ xfs_agino_t head_agino;
+ short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
+ int error;
- mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ trace_xfs_iunlink_remove(ip);
- /*
- * Get the agi buffer first. It ensures lock ordering
- * on the list.
- */
+ /* Get the agi buffer first. It ensures lock ordering on the list. */
error = xfs_read_agi(mp, tp, agno, &agibp);
if (error)
return error;
-
agi = XFS_BUF_TO_AGI(agibp);
/*
- * Get the index into the agi hash table for the
- * list this inode will go on.
+ * Get the index into the agi hash table for the list this inode is
+ * on. Make sure the head pointer isn't garbage.
*/
- agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
- if (!xfs_verify_agino(mp, agno, agino))
- return -EFSCORRUPTED;
- bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
- if (!xfs_verify_agino(mp, agno,
- be32_to_cpu(agi->agi_unlinked[bucket_index]))) {
+ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
+ if (!xfs_verify_agino(mp, agno, head_agino)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
agi, sizeof(*agi));
return -EFSCORRUPTED;
}
- if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
- /*
- * We're at the head of the list. Get the inode's on-disk
- * buffer to see if there is anyone after us on the list.
- * Only modify our next pointer if it is not already NULLAGINO.
- * This saves us the overhead of dealing with the buffer when
- * there is no need to change it.
- */
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the bucket head pointer at the next inode.
- */
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
- offset = offsetof(xfs_agi_t, agi_unlinked) +
- (sizeof(xfs_agino_t) * bucket_index);
- xfs_trans_log_buf(tp, agibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- } else {
- /*
- * We need to search the list for the inode being freed.
- */
- next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
- last_ibp = NULL;
- while (next_agino != agino) {
- struct xfs_imap imap;
+ /*
+ * Set our inode's next_unlinked pointer to NULLAGINO and save the old
+ * pointer value so that we can update whatever was previous to us in
+ * the list to point to whatever was next in the list.
+ */
+ error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino);
+ if (error)
+ return error;
- if (last_ibp)
- xfs_trans_brelse(tp, last_ibp);
+ /*
+ * If there was a backref pointing from the next inode back to this
+ * one, remove it because we've removed this inode from the list.
+ *
+ * Later, if this inode was in the middle of the list we'll update
+ * this inode's backref to point from the next inode.
+ */
+ if (next_agino != NULLAGINO) {
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_iunlink_change_backref(pag, next_agino,
+ NULLAGINO);
+ if (error)
+ goto out;
+ }
- imap.im_blkno = 0;
- next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
+ if (head_agino == agino) {
+ /* Point the head of the list to the next unlinked inode. */
+ error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
+ next_agino);
+ if (error)
+ goto out;
+ } else {
+ struct xfs_imap imap;
+ xfs_agino_t prev_agino;
- error = xfs_imap(mp, tp, next_ino, &imap, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap returned error %d.",
- __func__, error);
- return error;
- }
+ if (!pag)
+ pag = xfs_perag_get(mp, agno);
- error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
- &last_ibp, 0, 0);
- if (error) {
- xfs_warn(mp,
- "%s: xfs_imap_to_bp returned error %d.",
- __func__, error);
- return error;
- }
+ /* We need to search the list for the inode being freed. */
+ error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
+ &prev_agino, &imap, &last_dip, &last_ibp,
+ pag);
+ if (error)
+ goto out;
- last_offset = imap.im_boffset;
- next_agino = be32_to_cpu(last_dip->di_next_unlinked);
- if (!xfs_verify_agino(mp, agno, next_agino)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp,
- last_dip, sizeof(*last_dip));
- return -EFSCORRUPTED;
- }
- }
+ /* Point the previous inode on the list to the next inode. */
+ xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
+ last_dip, &imap, next_agino);
/*
- * Now last_ibp points to the buffer previous to us on the
- * unlinked list. Pull us from the list.
+ * Now we deal with the backref for this inode. If this inode
+ * pointed at a real inode, change the backref that pointed to
+ * us to point to our old next. If this inode was the end of
+ * the list, delete the backref that pointed to us. Note that
+ * change_backref takes care of deleting the backref if
+ * next_agino is NULLAGINO.
*/
- error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
- 0, 0);
- if (error) {
- xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
- __func__, error);
- return error;
- }
- next_agino = be32_to_cpu(dip->di_next_unlinked);
- ASSERT(next_agino != 0);
- ASSERT(next_agino != agino);
- if (next_agino != NULLAGINO) {
- dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_imap.im_boffset +
- offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, dip);
-
- xfs_trans_inode_buf(tp, ibp);
- xfs_trans_log_buf(tp, ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, ibp);
- } else {
- xfs_trans_brelse(tp, ibp);
- }
- /*
- * Point the previous inode on the list to the next inode.
- */
- last_dip->di_next_unlinked = cpu_to_be32(next_agino);
- ASSERT(next_agino != 0);
- offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
-
- /* need to recalc the inode CRC if appropriate */
- xfs_dinode_calc_crc(mp, last_dip);
-
- xfs_trans_inode_buf(tp, last_ibp);
- xfs_trans_log_buf(tp, last_ibp, offset,
- (offset + sizeof(xfs_agino_t) - 1));
- xfs_inobp_check(mp, last_ibp);
+ error = xfs_iunlink_change_backref(pag, agino, next_agino);
+ if (error)
+ goto out;
}
- return 0;
+
+out:
+ if (pag)
+ xfs_perag_put(pag);
+ return error;
}
/*
@@ -2193,13 +2531,14 @@ xfs_ifree_cluster(
xfs_inode_log_item_t *iip;
struct xfs_log_item *lip;
struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_ino_t inum;
inum = xic->first_ino;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- nbufs = mp->m_ialloc_blks / mp->m_blocks_per_cluster;
+ nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
- for (j = 0; j < nbufs; j++, inum += mp->m_inodes_per_cluster) {
+ for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
/*
* The allocation bitmap tells us which inodes of the chunk were
* physically allocated. Skip the cluster if an inode falls into
@@ -2207,7 +2546,7 @@ xfs_ifree_cluster(
*/
ioffset = inum - xic->first_ino;
if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) {
- ASSERT(ioffset % mp->m_inodes_per_cluster == 0);
+ ASSERT(ioffset % igeo->inodes_per_cluster == 0);
continue;
}
@@ -2223,7 +2562,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * mp->m_blocks_per_cluster,
+ mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED);
if (!bp)
@@ -2270,7 +2609,7 @@ xfs_ifree_cluster(
* transaction stale above, which means there is no point in
* even trying to lock them.
*/
- for (i = 0; i < mp->m_inodes_per_cluster; i++) {
+ for (i = 0; i < igeo->inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2748,9 +3087,7 @@ xfs_cross_rename(
error = xfs_droplink(tp, dp2);
if (error)
goto out_trans_abort;
- error = xfs_bumplink(tp, dp1);
- if (error)
- goto out_trans_abort;
+ xfs_bumplink(tp, dp1);
}
/*
@@ -2774,9 +3111,7 @@ xfs_cross_rename(
error = xfs_droplink(tp, dp1);
if (error)
goto out_trans_abort;
- error = xfs_bumplink(tp, dp2);
- if (error)
- goto out_trans_abort;
+ xfs_bumplink(tp, dp2);
}
/*
@@ -2833,11 +3168,9 @@ xfs_rename_alloc_whiteout(
/*
* Prepare the tmpfile inode as if it were created through the VFS.
- * Otherwise, the link increment paths will complain about nlink 0->1.
- * Drop the link count as done by d_tmpfile(), complete the inode setup
- * and flag it as linkable.
+ * Complete the inode setup and flag it as linkable. nlink is already
+ * zero, so we can skip the drop_nlink.
*/
- drop_nlink(VFS_I(tmpfile));
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
VFS_I(tmpfile)->i_state |= I_LINKABLE;
@@ -2975,9 +3308,7 @@ xfs_rename(
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
if (new_parent && src_is_directory) {
- error = xfs_bumplink(tp, target_dp);
- if (error)
- goto out_trans_cancel;
+ xfs_bumplink(tp, target_dp);
}
} else { /* target_ip != NULL */
/*
@@ -3096,9 +3427,7 @@ xfs_rename(
*/
if (wip) {
ASSERT(VFS_I(wip)->i_nlink == 0);
- error = xfs_bumplink(tp, wip);
- if (error)
- goto out_trans_cancel;
+ xfs_bumplink(tp, wip);
error = xfs_iunlink_remove(tp, wip);
if (error)
goto out_trans_cancel;
@@ -3138,28 +3467,27 @@ xfs_iflush_cluster(
struct xfs_mount *mp = ip->i_mount;
struct xfs_perag *pag;
unsigned long first_index, mask;
- unsigned long inodes_per_cluster;
int cilist_size;
struct xfs_inode **cilist;
struct xfs_inode *cip;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
int nr_found;
int clcount = 0;
int i;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
- cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
if (!cilist)
goto out_put;
- mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
+ mask = ~(igeo->inodes_per_cluster - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
- first_index, inodes_per_cluster);
+ first_index, igeo->inodes_per_cluster);
if (nr_found == 0)
goto out_free;
@@ -3267,7 +3595,6 @@ cluster_corrupt_out:
* inode buffer and shut down the filesystem.
*/
rcu_read_unlock();
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
/*
* We'll always have an inode attached to the buffer for completion
@@ -3277,11 +3604,14 @@ cluster_corrupt_out:
* xfs_buf_submit().
*/
ASSERT(bp->b_iodone);
+ bp->b_flags |= XBF_ASYNC;
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioerror(bp, -EIO);
xfs_buf_ioend(bp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+
/* abort the corrupt inode, as it was not attached to the buffer */
xfs_iflush_abort(cip, false);
kmem_free(cilist);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index be2014520155..558173f95a03 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -45,10 +45,18 @@ typedef struct xfs_inode {
mrlock_t i_lock; /* inode lock */
mrlock_t i_mmaplock; /* inode mmap IO lock */
atomic_t i_pincount; /* inode pin count */
+
+ /*
+ * Bitsets of inode metadata that have been checked and/or are sick.
+ * Callers must hold i_flags_lock before accessing this field.
+ */
+ uint16_t i_checked;
+ uint16_t i_sick;
+
spinlock_t i_flags_lock; /* inode i_flags lock */
/* Miscellaneous state. */
unsigned long i_flags; /* see defined flags below */
- unsigned int i_delayed_blks; /* count of delay alloc blks */
+ uint64_t i_delayed_blks; /* count of delay alloc blks */
struct xfs_icdinode i_d; /* most of ondisk inode */
@@ -57,6 +65,11 @@ typedef struct xfs_inode {
/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */
+
+ /* pending io completions */
+ spinlock_t i_ioend_lock;
+ struct work_struct i_ioend_work;
+ struct list_head i_ioend_list;
} xfs_inode_t;
/* Convert from vfs inode to xfs inode */
@@ -500,4 +513,9 @@ extern struct kmem_zone *xfs_inode_zone;
bool xfs_inode_verify_forks(struct xfs_inode *ip);
+int xfs_iunlink_init(struct xfs_perag *pag);
+void xfs_iunlink_destroy(struct xfs_perag *pag);
+
+void xfs_end_io(struct work_struct *work);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fa1c4fe2ffbf..c9a502eed204 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -5,6 +5,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -12,7 +13,6 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
@@ -565,7 +565,7 @@ out_unlock:
* Unlock the inode associated with the inode log item.
*/
STATIC void
-xfs_inode_item_unlock(
+xfs_inode_item_release(
struct xfs_log_item *lip)
{
struct xfs_inode_log_item *iip = INODE_ITEM(lip);
@@ -621,23 +621,21 @@ xfs_inode_item_committed(
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+ xfs_lsn_t commit_lsn)
{
- INODE_ITEM(lip)->ili_last_lsn = lsn;
+ INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
+ return xfs_inode_item_release(lip);
}
-/*
- * This is the ops vector shared by all buf log items.
- */
static const struct xfs_item_ops xfs_inode_item_ops = {
.iop_size = xfs_inode_item_size,
.iop_format = xfs_inode_item_format,
.iop_pin = xfs_inode_item_pin,
.iop_unpin = xfs_inode_item_unpin,
- .iop_unlock = xfs_inode_item_unlock,
+ .iop_release = xfs_inode_item_release,
.iop_committed = xfs_inode_item_committed,
.iop_push = xfs_inode_item_push,
- .iop_committing = xfs_inode_item_committing,
+ .iop_committing = xfs_inode_item_committing,
.iop_error = xfs_inode_item_error
};
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 27081eba220c..07a60e74c39c 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -14,7 +14,7 @@ struct xfs_inode;
struct xfs_mount;
typedef struct xfs_inode_log_item {
- xfs_log_item_t ili_item; /* common portion */
+ struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6ecdbb3af7de..6f7848cd5527 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -11,9 +11,8 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_ioctl.h"
-#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
+#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
#include "xfs_attr.h"
@@ -25,7 +24,6 @@
#include "xfs_export.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_acl.h"
#include "xfs_btree.h"
@@ -33,15 +31,11 @@
#include "xfs_fsmap.h"
#include "scrub/xfs_scrub.h"
#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_health.h"
-#include <linux/capability.h>
-#include <linux/cred.h>
-#include <linux/dcache.h>
#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/exportfs.h>
/*
* xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
@@ -719,16 +713,45 @@ out_unlock:
return error;
}
+/* Return -EFAULT on a failed user copy, else xfs_ibulk_advance()'s result */
+int
+xfs_fsbulkstat_one_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ struct xfs_bstat bs1;
+
+ xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
+ if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat));
+}
+
+int
+xfs_fsinumbers_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp)
+{
+ struct xfs_inogrp ig1;
+
+ xfs_inumbers_to_inogrp(&ig1, igrp);
+ if (copy_to_user(breq->ubuffer, &ig1, sizeof(struct xfs_inogrp)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_inogrp));
+}
+
STATIC int
-xfs_ioc_bulkstat(
+xfs_ioc_fsbulkstat(
xfs_mount_t *mp,
unsigned int cmd,
void __user *arg)
{
- xfs_fsop_bulkreq_t bulkreq;
- int count; /* # of records returned */
- xfs_ino_t inlast; /* last inode number */
- int done;
+ struct xfs_fsop_bulkreq bulkreq;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .ocount = 0,
+ };
+ xfs_ino_t lastino;
int error;
/* done = 1 if there are more stats to get and if bulkstat */
@@ -740,79 +763,287 @@ xfs_ioc_bulkstat(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
+ if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq)))
return -EFAULT;
- if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+ if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
return -EFAULT;
- if ((count = bulkreq.icount) <= 0)
+ if (bulkreq.icount <= 0)
return -EINVAL;
if (bulkreq.ubuffer == NULL)
return -EINVAL;
- if (cmd == XFS_IOC_FSINUMBERS)
- error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer, xfs_inumbers_fmt);
- else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
- error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
- sizeof(xfs_bstat_t), NULL, &done);
- else /* XFS_IOC_FSBULKSTAT */
- error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
- sizeof(xfs_bstat_t), bulkreq.ubuffer,
- &done);
+ breq.ubuffer = bulkreq.ubuffer;
+ breq.icount = bulkreq.icount;
+
+ /*
+ * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+ * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect
+ * that *lastip contains either zero or the number of the last inode to
+ * be examined by the previous call and return results starting with
+ * the next inode after that. The new bulk request back end functions
+ * take the inode to start with, so we have to compute the startino
+ * parameter from lastino to maintain correct function. lastino == 0
+ * is a special case because it has traditionally meant "first inode
+ * in filesystem".
+ */
+ if (cmd == XFS_IOC_FSINUMBERS) {
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_inumbers(&breq, xfs_fsinumbers_fmt);
+ lastino = breq.startino - 1;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) {
+ breq.startino = lastino;
+ breq.icount = 1;
+ error = xfs_bulkstat_one(&breq, xfs_fsbulkstat_one_fmt);
+ } else { /* XFS_IOC_FSBULKSTAT */
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_bulkstat(&breq, xfs_fsbulkstat_one_fmt);
+ lastino = breq.startino - 1;
+ }
if (error)
return error;
- if (bulkreq.ocount != NULL) {
- if (copy_to_user(bulkreq.lastip, &inlast,
- sizeof(xfs_ino_t)))
- return -EFAULT;
+ if (bulkreq.lastip != NULL &&
+ copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+ return -EFAULT;
- if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -EFAULT;
+ if (bulkreq.ocount != NULL &&
+ copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* Return -EFAULT on a failed user copy, else xfs_ibulk_advance()'s result */
+static int
+xfs_bulkstat_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
+{
+ if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat));
+}
+
+/*
+ * Check the incoming bulk request @hdr from userspace and initialize the
+ * internal @breq bulk request appropriately. Returns 0 if the bulk request
+ * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual
+ * negative error code.
+ */
+static int
+xfs_bulk_ireq_setup(
+ struct xfs_mount *mp,
+ struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq,
+ void __user *ubuffer)
+{
+ if (hdr->icount == 0 ||
+ (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) ||
+ memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ breq->startino = hdr->ino;
+ breq->ubuffer = ubuffer;
+ breq->icount = hdr->icount;
+ breq->ocount = 0;
+ breq->flags = 0;
+
+ /*
+ * The @ino parameter is a special value, so we must look it up here
+ * and start the walk there. We're not allowed to have IREQ_AGNO, and
+ * we only return one inode worth of data.
+ */
+ if (hdr->flags & XFS_BULK_IREQ_SPECIAL) {
+ if (hdr->flags & XFS_BULK_IREQ_AGNO)
+ return -EINVAL;
+
+ switch (hdr->ino) {
+ case XFS_BULK_IREQ_SPECIAL_ROOT:
+ breq->startino = mp->m_sb.sb_rootino;
+ break;
+ default:
+ return -EINVAL;
+ }
+ breq->icount = 1;
}
+ /*
+ * The IREQ_AGNO flag means that we only want results from a given AG.
+ * If @hdr->ino is zero, we start iterating in that AG. If @hdr->ino is
+ * beyond the specified AG then we return no results.
+ */
+ if (hdr->flags & XFS_BULK_IREQ_AGNO) {
+ if (hdr->agno >= mp->m_sb.sb_agcount)
+ return -EINVAL;
+
+ if (breq->startino == 0)
+ breq->startino = XFS_AGINO_TO_INO(mp, hdr->agno, 0);
+ else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
+ return -EINVAL;
+
+ breq->flags |= XFS_IBULK_SAME_AG;
+
+ /* Asking for an inode past the end of the AG? We're done! */
+ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
+ return XFS_ITER_ABORT;
+ } else if (hdr->agno)
+ return -EINVAL;
+
+ /* Asking for an inode past the end of the FS? We're done! */
+ if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
+ return XFS_ITER_ABORT;
+
return 0;
}
+/*
+ * Update the userspace bulk request @hdr to reflect the end state of the
+ * internal bulk request @breq.
+ */
+static void
+xfs_bulk_ireq_teardown(
+ struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq)
+{
+ hdr->ino = breq->startino;
+ hdr->ocount = breq->ocount;
+}
+
+/* Handle the v5 bulkstat ioctl. */
STATIC int
-xfs_ioc_fsgeometry_v1(
- xfs_mount_t *mp,
- void __user *arg)
+xfs_ioc_bulkstat(
+ struct xfs_mount *mp,
+ unsigned int cmd,
+ struct xfs_bulkstat_req __user *arg)
{
- xfs_fsop_geom_t fsgeo;
- int error;
+ struct xfs_bulk_ireq hdr;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+ return -EFAULT;
- error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
+ error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat);
+ if (error == XFS_ITER_ABORT)
+ goto out_teardown;
+ if (error < 0)
+ return error;
+
+ error = xfs_bulkstat(&breq, xfs_bulkstat_fmt);
if (error)
return error;
- /*
- * Caller should have passed an argument of type
- * xfs_fsop_geom_v1_t. This is a proper subset of the
- * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
- */
- if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
+out_teardown:
+ xfs_bulk_ireq_teardown(&hdr, &breq);
+ if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
return -EFAULT;
+
+ return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp)
+{
+ if (copy_to_user(breq->ubuffer, igrp, sizeof(struct xfs_inumbers)))
+ return -EFAULT;
+ return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers));
+}
+
+/* Handle the v5 inumbers ioctl. */
+STATIC int
+xfs_ioc_inumbers(
+ struct xfs_mount *mp,
+ unsigned int cmd,
+ struct xfs_inumbers_req __user *arg)
+{
+ struct xfs_bulk_ireq hdr;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr)))
+ return -EFAULT;
+
+ error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers);
+ if (error == XFS_ITER_ABORT)
+ goto out_teardown;
+ if (error < 0)
+ return error;
+
+ error = xfs_inumbers(&breq, xfs_inumbers_fmt);
+ if (error)
+ return error;
+
+out_teardown:
+ xfs_bulk_ireq_teardown(&hdr, &breq);
+ if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+
return 0;
}
STATIC int
xfs_ioc_fsgeometry(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
+ void __user *arg,
+ int struct_version)
+{
+ struct xfs_fsop_geom fsgeo;
+ size_t len;
+
+ xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version);
+
+ if (struct_version <= 3)
+ len = sizeof(struct xfs_fsop_geom_v1);
+ else if (struct_version == 4)
+ len = sizeof(struct xfs_fsop_geom_v4);
+ else {
+ xfs_fsop_geom_health(mp, &fsgeo);
+ len = sizeof(fsgeo);
+ }
+
+ if (copy_to_user(arg, &fsgeo, len))
+ return -EFAULT;
+ return 0;
+}
+
+STATIC int
+xfs_ioc_ag_geometry(
+ struct xfs_mount *mp,
void __user *arg)
{
- xfs_fsop_geom_t fsgeo;
+ struct xfs_ag_geometry ageo;
int error;
- error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4);
+ if (copy_from_user(&ageo, arg, sizeof(ageo)))
+ return -EFAULT;
+
+ error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo);
if (error)
return error;
- if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+ if (copy_to_user(arg, &ageo, sizeof(ageo)))
return -EFAULT;
return 0;
}
@@ -871,37 +1102,44 @@ xfs_di2lxflags(
return flags;
}
-STATIC int
-xfs_ioc_fsgetxattr(
- xfs_inode_t *ip,
- int attr,
- void __user *arg)
+static void
+xfs_fill_fsxattr(
+ struct xfs_inode *ip,
+ bool attr,
+ struct fsxattr *fa)
{
- struct fsxattr fa;
-
- memset(&fa, 0, sizeof(struct fsxattr));
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- fa.fsx_xflags = xfs_ip2xflags(ip);
- fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
- fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
+ simple_fill_fsxattr(fa, xfs_ip2xflags(ip));
+ fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+ fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
ip->i_mount->m_sb.sb_blocklog;
- fa.fsx_projid = xfs_get_projid(ip);
+ fa->fsx_projid = xfs_get_projid(ip);
if (attr) {
if (ip->i_afp) {
if (ip->i_afp->if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = xfs_iext_count(ip->i_afp);
+ fa->fsx_nextents = xfs_iext_count(ip->i_afp);
else
- fa.fsx_nextents = ip->i_d.di_anextents;
+ fa->fsx_nextents = ip->i_d.di_anextents;
} else
- fa.fsx_nextents = 0;
+ fa->fsx_nextents = 0;
} else {
if (ip->i_df.if_flags & XFS_IFEXTENTS)
- fa.fsx_nextents = xfs_iext_count(&ip->i_df);
+ fa->fsx_nextents = xfs_iext_count(&ip->i_df);
else
- fa.fsx_nextents = ip->i_d.di_nextents;
+ fa->fsx_nextents = ip->i_d.di_nextents;
}
+}
+
+STATIC int
+xfs_ioc_fsgetxattr(
+ xfs_inode_t *ip,
+ int attr,
+ void __user *arg)
+{
+ struct fsxattr fa;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ xfs_fill_fsxattr(ip, attr, &fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (copy_to_user(arg, &fa, sizeof(fa)))
@@ -1027,15 +1265,6 @@ xfs_ioctl_setattr_xflags(
if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
return -EINVAL;
- /*
- * Can't modify an immutable/append-only file unless
- * we have appropriate permission.
- */
- if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
- (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
if (di_flags2 && ip->i_d.di_version < 3)
@@ -1142,7 +1371,7 @@ xfs_ioctl_setattr_get_trans(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
if (error)
- return ERR_PTR(error);
+ goto out_unlock;
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags);
@@ -1194,39 +1423,31 @@ xfs_ioctl_setattr_check_extsize(
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(VFS_I(ip)->i_mode))
- return -EINVAL;
+ xfs_extlen_t size;
+ xfs_fsblock_t extsize_fsb;
if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
- if (fa->fsx_extsize != 0) {
- xfs_extlen_t size;
- xfs_fsblock_t extsize_fsb;
-
- extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
- if (extsize_fsb > MAXEXTLEN)
- return -EINVAL;
+ if (fa->fsx_extsize == 0)
+ return 0;
- if (XFS_IS_REALTIME_INODE(ip) ||
- (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
- size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
- } else {
- size = mp->m_sb.sb_blocksize;
- if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
- return -EINVAL;
- }
+ extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+ if (extsize_fsb > MAXEXTLEN)
+ return -EINVAL;
- if (fa->fsx_extsize % size)
+ if (XFS_IS_REALTIME_INODE(ip) ||
+ (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
+ size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+ } else {
+ size = mp->m_sb.sb_blocksize;
+ if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
return -EINVAL;
- } else
- fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+ }
+
+ if (fa->fsx_extsize % size)
+ return -EINVAL;
return 0;
}
@@ -1252,6 +1473,8 @@ xfs_ioctl_setattr_check_cowextsize(
struct fsxattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t size;
+ xfs_fsblock_t cowextsize_fsb;
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
@@ -1260,25 +1483,19 @@ xfs_ioctl_setattr_check_cowextsize(
ip->i_d.di_version != 3)
return -EINVAL;
- if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
- return -EINVAL;
-
- if (fa->fsx_cowextsize != 0) {
- xfs_extlen_t size;
- xfs_fsblock_t cowextsize_fsb;
+ if (fa->fsx_cowextsize == 0)
+ return 0;
- cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
- if (cowextsize_fsb > MAXEXTLEN)
- return -EINVAL;
+ cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+ if (cowextsize_fsb > MAXEXTLEN)
+ return -EINVAL;
- size = mp->m_sb.sb_blocksize;
- if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
- return -EINVAL;
+ size = mp->m_sb.sb_blocksize;
+ if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+ return -EINVAL;
- if (fa->fsx_cowextsize % size)
- return -EINVAL;
- } else
- fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+ if (fa->fsx_cowextsize % size)
+ return -EINVAL;
return 0;
}
@@ -1292,21 +1509,6 @@ xfs_ioctl_setattr_check_projid(
if (fa->fsx_projid > (uint16_t)-1 &&
!xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
return -EINVAL;
-
- /*
- * Project Quota ID state is only allowed to change from within the init
- * namespace. Enforce that restriction only if we are trying to change
- * the quota ID state. Everything else is allowed in user namespaces.
- */
- if (current_user_ns() == &init_user_ns)
- return 0;
-
- if (xfs_get_projid(ip) != fa->fsx_projid)
- return -EINVAL;
- if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
- (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
- return -EINVAL;
-
return 0;
}
@@ -1315,6 +1517,7 @@ xfs_ioctl_setattr(
xfs_inode_t *ip,
struct fsxattr *fa)
{
+ struct fsxattr old_fa;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_dquot *udqp = NULL;
@@ -1362,7 +1565,6 @@ xfs_ioctl_setattr(
goto error_free_dquots;
}
-
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
xfs_get_projid(ip) != fa->fsx_projid) {
code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
@@ -1371,6 +1573,11 @@ xfs_ioctl_setattr(
goto error_trans_cancel;
}
+ xfs_fill_fsxattr(ip, false, &old_fa);
+ code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa);
+ if (code)
+ goto error_trans_cancel;
+
code = xfs_ioctl_setattr_check_extsize(ip, fa);
if (code)
goto error_trans_cancel;
@@ -1481,6 +1688,7 @@ xfs_ioc_setxflags(
{
struct xfs_trans *tp;
struct fsxattr fa;
+ struct fsxattr old_fa;
unsigned int flags;
int join_flags = 0;
int error;
@@ -1516,6 +1724,13 @@ xfs_ioc_setxflags(
goto out_drop_write;
}
+ xfs_fill_fsxattr(ip, false, &old_fa);
+ error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa);
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_drop_write;
+ }
+
error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
if (error) {
xfs_trans_cancel(tp);
@@ -1934,13 +2149,22 @@ xfs_file_ioctl(
case XFS_IOC_FSBULKSTAT_SINGLE:
case XFS_IOC_FSBULKSTAT:
case XFS_IOC_FSINUMBERS:
+ return xfs_ioc_fsbulkstat(mp, cmd, arg);
+
+ case XFS_IOC_BULKSTAT:
return xfs_ioc_bulkstat(mp, cmd, arg);
+ case XFS_IOC_INUMBERS:
+ return xfs_ioc_inumbers(mp, cmd, arg);
case XFS_IOC_FSGEOMETRY_V1:
- return xfs_ioc_fsgeometry_v1(mp, arg);
-
+ return xfs_ioc_fsgeometry(mp, arg, 3);
+ case XFS_IOC_FSGEOMETRY_V4:
+ return xfs_ioc_fsgeometry(mp, arg, 4);
case XFS_IOC_FSGEOMETRY:
- return xfs_ioc_fsgeometry(mp, arg);
+ return xfs_ioc_fsgeometry(mp, arg, 5);
+
+ case XFS_IOC_AG_GEOMETRY:
+ return xfs_ioc_ag_geometry(mp, arg);
case XFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
@@ -2031,9 +2255,7 @@ xfs_file_ioctl(
case XFS_IOC_FSCOUNTS: {
xfs_fsop_counts_t out;
- error = xfs_fs_counts(mp, &out);
- if (error)
- return error;
+ xfs_fs_counts(mp, &out);
if (copy_to_user(arg, &out, sizeof(out)))
return -EFAULT;
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 4b17f67c888a..654c0bb1bcf8 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -77,4 +77,12 @@ xfs_set_dmattrs(
uint evmask,
uint16_t state);
+struct xfs_ibulk;
+struct xfs_bulkstat; /* record type passed to xfs_fsbulkstat_one_fmt() */
+struct xfs_inumbers; /* record type passed to xfs_fsinumbers_fmt() */
+
+int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat);
+int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp);
+
#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 5001dca361e9..7fcf7569743f 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -3,23 +3,19 @@
* Copyright (c) 2004-2005 Silicon Graphics, Inc.
* All Rights Reserved.
*/
-#include <linux/compat.h>
-#include <linux/ioctl.h>
#include <linux/mount.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
#include <linux/fsmap.h>
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_iwalk.h"
#include "xfs_itable.h"
-#include "xfs_error.h"
#include "xfs_fsops.h"
-#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
@@ -52,12 +48,9 @@ xfs_compat_ioc_fsgeometry_v1(
struct xfs_mount *mp,
compat_xfs_fsop_geom_v1_t __user *arg32)
{
- xfs_fsop_geom_t fsgeo;
- int error;
+ struct xfs_fsop_geom fsgeo;
- error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
- if (error)
- return error;
+ xfs_fs_geometry(&mp->m_sb, &fsgeo, 3);
/* The 32-bit variant simply has some padding at the end */
if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
return -EFAULT;
@@ -87,27 +80,26 @@ xfs_compat_growfs_rt_copyin(
}
STATIC int
-xfs_inumbers_fmt_compat(
- void __user *ubuffer,
- const struct xfs_inogrp *buffer,
- long count,
- long *written)
+xfs_fsinumbers_fmt_compat(
+ struct xfs_ibulk *breq,
+ const struct xfs_inumbers *ig)
{
- compat_xfs_inogrp_t __user *p32 = ubuffer;
- long i;
+ struct compat_xfs_inogrp __user *p32 = breq->ubuffer;
+ struct xfs_inogrp ig1;
+ struct xfs_inogrp *igrp = &ig1;
- for (i = 0; i < count; i++) {
- if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
- put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
- put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
- return -EFAULT;
- }
- *written = count * sizeof(*p32);
- return 0;
+ xfs_inumbers_to_inogrp(&ig1, ig);
+
+ if (put_user(igrp->xi_startino, &p32->xi_startino) ||
+ put_user(igrp->xi_alloccount, &p32->xi_alloccount) ||
+ put_user(igrp->xi_allocmask, &p32->xi_allocmask))
+ return -EFAULT;
+
+ return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_inogrp));
}
#else
-#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
+#define xfs_fsinumbers_fmt_compat xfs_fsinumbers_fmt
#endif /* BROKEN_X86_ALIGNMENT */
STATIC int
@@ -124,11 +116,14 @@ xfs_ioctl32_bstime_copyin(
return 0;
}
-/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+/*
+ * struct xfs_bstat has differing alignment on intel, & bstime_t sizes
+ * everywhere
+ */
STATIC int
xfs_ioctl32_bstat_copyin(
- xfs_bstat_t *bstat,
- compat_xfs_bstat_t __user *bstat32)
+ struct xfs_bstat *bstat,
+ struct compat_xfs_bstat __user *bstat32)
{
if (get_user(bstat->bs_ino, &bstat32->bs_ino) ||
get_user(bstat->bs_mode, &bstat32->bs_mode) ||
@@ -174,16 +169,15 @@ xfs_bstime_store_compat(
/* Return -EFAULT on a failed user copy, else xfs_ibulk_advance()'s result */
STATIC int
-xfs_bulkstat_one_fmt_compat(
- void __user *ubuffer,
- int ubsize,
- int *ubused,
- const xfs_bstat_t *buffer)
+xfs_fsbulkstat_one_fmt_compat(
+ struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat)
{
- compat_xfs_bstat_t __user *p32 = ubuffer;
+ struct compat_xfs_bstat __user *p32 = breq->ubuffer;
+ struct xfs_bstat bs1;
+ struct xfs_bstat *buffer = &bs1;
- if (ubsize < sizeof(*p32))
- return -ENOMEM;
+ xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat);
if (put_user(buffer->bs_ino, &p32->bs_ino) ||
put_user(buffer->bs_mode, &p32->bs_mode) ||
@@ -208,37 +202,24 @@ xfs_bulkstat_one_fmt_compat(
put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
put_user(buffer->bs_aextents, &p32->bs_aextents))
return -EFAULT;
- if (ubused)
- *ubused = sizeof(*p32);
- return 0;
-}
-STATIC int
-xfs_bulkstat_one_compat(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- int *ubused, /* bytes used by me */
- int *stat) /* BULKSTAT_RV_... */
-{
- return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
- xfs_bulkstat_one_fmt_compat,
- ubused, stat);
+ return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_bstat));
}
/* copied from xfs_ioctl.c */
STATIC int
-xfs_compat_ioc_bulkstat(
+xfs_compat_ioc_fsbulkstat(
xfs_mount_t *mp,
unsigned int cmd,
- compat_xfs_fsop_bulkreq_t __user *p32)
+ struct compat_xfs_fsop_bulkreq __user *p32)
{
u32 addr;
- xfs_fsop_bulkreq_t bulkreq;
- int count; /* # of records returned */
- xfs_ino_t inlast; /* last inode number */
- int done;
+ struct xfs_fsop_bulkreq bulkreq;
+ struct xfs_ibulk breq = {
+ .mp = mp,
+ .ocount = 0,
+ };
+ xfs_ino_t lastino;
int error;
/*
@@ -247,9 +228,8 @@ xfs_compat_ioc_bulkstat(
* to userspace memory via bulkreq.ubuffer. Normally the compat
* functions and structure size are the correct ones to use ...
*/
- inumbers_fmt_pf inumbers_func = xfs_inumbers_fmt_compat;
- bulkstat_one_pf bs_one_func = xfs_bulkstat_one_compat;
- size_t bs_one_size = sizeof(struct compat_xfs_bstat);
+ inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat;
+ bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat;
#ifdef CONFIG_X86_X32
if (in_x32_syscall()) {
@@ -261,9 +241,8 @@ xfs_compat_ioc_bulkstat(
* the data written out in compat layout will not match what
* x32 userspace expects.
*/
- inumbers_func = xfs_inumbers_fmt;
- bs_one_func = xfs_bulkstat_one;
- bs_one_size = sizeof(struct xfs_bstat);
+ inumbers_func = xfs_fsinumbers_fmt;
+ bs_one_func = xfs_fsbulkstat_one_fmt;
}
#endif
@@ -287,40 +266,55 @@ xfs_compat_ioc_bulkstat(
return -EFAULT;
bulkreq.ocount = compat_ptr(addr);
- if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+ if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64)))
return -EFAULT;
- if ((count = bulkreq.icount) <= 0)
+ if (bulkreq.icount <= 0)
return -EINVAL;
if (bulkreq.ubuffer == NULL)
return -EINVAL;
+ breq.ubuffer = bulkreq.ubuffer;
+ breq.icount = bulkreq.icount;
+
+ /*
+ * FSBULKSTAT_SINGLE expects that *lastip contains the inode number
+ * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect
+ * that *lastip contains either zero or the number of the last inode to
+ * be examined by the previous call and return results starting with
+ * the next inode after that. The new bulk request back end functions
+ * take the inode to start with, so we have to compute the startino
+ * parameter from lastino to maintain correct function. lastino == 0
+ * is a special case because it has traditionally meant "first inode
+ * in filesystem".
+ */
if (cmd == XFS_IOC_FSINUMBERS_32) {
- error = xfs_inumbers(mp, &inlast, &count,
- bulkreq.ubuffer, inumbers_func);
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_inumbers(&breq, inumbers_func);
+ lastino = breq.startino - 1;
} else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
- int res;
-
- error = bs_one_func(mp, inlast, bulkreq.ubuffer,
- bs_one_size, NULL, &res);
+ breq.startino = lastino;
+ breq.icount = 1;
+ error = xfs_bulkstat_one(&breq, bs_one_func);
+ lastino = breq.startino;
} else if (cmd == XFS_IOC_FSBULKSTAT_32) {
- error = xfs_bulkstat(mp, &inlast, &count,
- bs_one_func, bs_one_size,
- bulkreq.ubuffer, &done);
- } else
+ breq.startino = lastino ? lastino + 1 : 0;
+ error = xfs_bulkstat(&breq, bs_one_func);
+ lastino = breq.startino - 1;
+ } else {
error = -EINVAL;
+ }
if (error)
return error;
- if (bulkreq.ocount != NULL) {
- if (copy_to_user(bulkreq.lastip, &inlast,
- sizeof(xfs_ino_t)))
- return -EFAULT;
+ if (bulkreq.lastip != NULL &&
+ copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t)))
+ return -EFAULT;
- if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
- return -EFAULT;
- }
+ if (bulkreq.ocount != NULL &&
+ copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32)))
+ return -EFAULT;
return 0;
}
@@ -561,7 +555,9 @@ xfs_file_compat_ioctl(
switch (cmd) {
/* No size or alignment issues on any arch */
case XFS_IOC_DIOINFO:
+ case XFS_IOC_FSGEOMETRY_V4:
case XFS_IOC_FSGEOMETRY:
+ case XFS_IOC_AG_GEOMETRY:
case XFS_IOC_FSGETXATTR:
case XFS_IOC_FSSETXATTR:
case XFS_IOC_FSGETXATTRA:
@@ -578,6 +574,8 @@ xfs_file_compat_ioctl(
case XFS_IOC_ERROR_CLEARALL:
case FS_IOC_GETFSMAP:
case XFS_IOC_SCRUB_METADATA:
+ case XFS_IOC_BULKSTAT:
+ case XFS_IOC_INUMBERS:
return xfs_file_ioctl(filp, cmd, p);
#if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32)
/*
@@ -675,7 +673,7 @@ xfs_file_compat_ioctl(
case XFS_IOC_FSBULKSTAT_32:
case XFS_IOC_FSBULKSTAT_SINGLE_32:
case XFS_IOC_FSINUMBERS_32:
- return xfs_compat_ioc_bulkstat(mp, cmd, arg);
+ return xfs_compat_ioc_fsbulkstat(mp, cmd, arg);
case XFS_IOC_FD_TO_HANDLE_32:
case XFS_IOC_PATH_TO_HANDLE_32:
case XFS_IOC_PATH_TO_FSHANDLE_32: {
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index d28fa824284a..7985344d3aa6 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -36,7 +36,7 @@ typedef struct compat_xfs_bstime {
__s32 tv_nsec; /* and nanoseconds */
} compat_xfs_bstime_t;
-typedef struct compat_xfs_bstat {
+struct compat_xfs_bstat {
__u64 bs_ino; /* inode number */
__u16 bs_mode; /* type and mode */
__u16 bs_nlink; /* number of links */
@@ -61,14 +61,14 @@ typedef struct compat_xfs_bstat {
__u32 bs_dmevmask; /* DMIG event mask */
__u16 bs_dmstate; /* DMIG state info */
__u16 bs_aextents; /* attribute number of extents */
-} __compat_packed compat_xfs_bstat_t;
+} __compat_packed;
-typedef struct compat_xfs_fsop_bulkreq {
+struct compat_xfs_fsop_bulkreq {
compat_uptr_t lastip; /* last inode # pointer */
__s32 icount; /* count of entries in buffer */
compat_uptr_t ubuffer; /* user buffer for inode desc. */
compat_uptr_t ocount; /* output count pointer */
-} compat_xfs_fsop_bulkreq_t;
+};
#define XFS_IOC_FSBULKSTAT_32 \
_IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
@@ -106,7 +106,7 @@ typedef struct compat_xfs_swapext {
xfs_off_t sx_offset; /* offset into file */
xfs_off_t sx_length; /* leng from offset */
char sx_pad[16]; /* pad space, unused */
- compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */
+ struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */
} __compat_packed compat_xfs_swapext_t;
#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
@@ -201,11 +201,11 @@ typedef struct compat_xfs_fsop_geom_v1 {
#define XFS_IOC_FSGEOMETRY_V1_32 \
_IOR('X', 100, struct compat_xfs_fsop_geom_v1)
-typedef struct compat_xfs_inogrp {
+struct compat_xfs_inogrp {
__u64 xi_startino; /* starting inode number */
__s32 xi_alloccount; /* # bits set in allocmask */
__u64 xi_allocmask; /* mask of allocated inodes */
-} __attribute__((packed)) compat_xfs_inogrp_t;
+} __attribute__((packed));
/* These growfs input structures have padding on the end, so must translate */
typedef struct compat_xfs_growfs_data {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 27c93b5f029d..3a4310d7cb59 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -4,7 +4,6 @@
* Copyright (c) 2016-2018 Christoph Hellwig.
* All Rights Reserved.
*/
-#include <linux/iomap.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
@@ -12,7 +11,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap_btree.h"
@@ -25,7 +23,6 @@
#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
-#include "xfs_icache.h"
#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
@@ -35,18 +32,40 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log)
-void
+static int
+xfs_alert_fsblock_zero(
+ xfs_inode_t *ip,
+ xfs_bmbt_irec_t *imap)
+{
+ xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
+ "Access to block zero in inode %llu "
+ "start_block: %llx start_off: %llx "
+ "blkcnt: %llx extent-state: %x",
+ (unsigned long long)ip->i_ino,
+ (unsigned long long)imap->br_startblock,
+ (unsigned long long)imap->br_startoff,
+ (unsigned long long)imap->br_blockcount,
+ imap->br_state);
+ return -EFSCORRUPTED;
+}
+
+int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool shared)
{
struct xfs_mount *mp = ip->i_mount;
+ if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip)))
+ return xfs_alert_fsblock_zero(ip, imap);
+
if (imap->br_startblock == HOLESTARTBLOCK) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
- } else if (imap->br_startblock == DELAYSTARTBLOCK) {
+ } else if (imap->br_startblock == DELAYSTARTBLOCK ||
+ isnullstartblock(imap->br_startblock)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_DELALLOC;
} else {
@@ -60,6 +79,13 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
+ if (shared)
+ iomap->flags |= IOMAP_F_SHARED;
+ return 0;
}
static void
@@ -138,23 +164,6 @@ xfs_iomap_eof_align_last_fsb(
return 0;
}
-STATIC int
-xfs_alert_fsblock_zero(
- xfs_inode_t *ip,
- xfs_bmbt_irec_t *imap)
-{
- xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
- "Access to block zero in inode %llu "
- "start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x",
- (unsigned long long)ip->i_ino,
- (unsigned long long)imap->br_startblock,
- (unsigned long long)imap->br_startoff,
- (unsigned long long)imap->br_blockcount,
- imap->br_state);
- return -EFSCORRUPTED;
-}
-
int
xfs_iomap_write_direct(
xfs_inode_t *ip,
@@ -383,12 +392,13 @@ xfs_quota_calc_throttle(
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_inode *ip,
+ int whichfork,
loff_t offset,
loff_t count,
struct xfs_iext_cursor *icur)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmbt_irec prev;
int shift = 0;
@@ -522,15 +532,16 @@ xfs_file_iomap_begin_delay(
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t maxbytes_fsb =
XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
xfs_fileoff_t end_fsb;
- int error = 0, eof = 0;
- struct xfs_bmbt_irec got;
- struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
+ bool eof = false, cow_eof = false, shared = false;
+ int whichfork = XFS_DATA_FORK;
+ int error = 0;
ASSERT(!XFS_IS_REALTIME_INODE(ip));
ASSERT(!xfs_get_extsz_hint(ip));
@@ -548,7 +559,7 @@ xfs_file_iomap_begin_delay(
XFS_STATS_INC(mp, xs_blk_mapw);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
goto out_unlock;
@@ -556,53 +567,101 @@ xfs_file_iomap_begin_delay(
end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
- eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
+ /*
+ * Search the data fork first to look up our source mapping. We
+ * always need the data fork map, as we have to return it to the
+ * iomap code so that the higher level write code can read data in to
+ * perform read-modify-write cycles for unaligned writes.
+ */
+ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
if (eof)
- got.br_startoff = end_fsb; /* fake hole until the end */
+ imap.br_startoff = end_fsb; /* fake hole until the end */
+
+ /* We never need to allocate blocks for zeroing a hole. */
+ if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+ goto out_unlock;
+ }
- if (got.br_startoff <= offset_fsb) {
+ /*
+ * Search the COW fork extent list even if we did not find a data fork
+ * extent. This serves two purposes: first this implements the
+ * speculative preallocation using cowextsize, so that we also unshare
+ * blocks adjacent to shared blocks instead of just the shared blocks
+ * themselves. Second the lookup in the extent list is generally faster
+ * than going out to the shared extent tree.
+ */
+ if (xfs_is_cow_inode(ip)) {
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+ cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+ &ccur, &cmap);
+ if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &cmap);
+ whichfork = XFS_COW_FORK;
+ goto done;
+ }
+ }
+
+ if (imap.br_startoff <= offset_fsb) {
/*
* For reflink files we may need a delalloc reservation when
* overwriting shared extents. This includes zeroing of
* existing extents that contain data.
*/
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- got.br_state != XFS_EXT_UNWRITTEN)) {
- xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got);
- if (error)
- goto out_unlock;
+ if (!xfs_is_cow_inode(ip) ||
+ ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
}
- trace_xfs_iomap_found(ip, offset, count, 0, &got);
- goto done;
- }
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
- if (flags & IOMAP_ZERO) {
- xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
- goto out_unlock;
+ /* Trim the mapping to the nearest shared extent boundary. */
+ error = xfs_inode_need_cow(ip, &imap, &shared);
+ if (error)
+ goto out_unlock;
+
+ /* Not shared? Just report the (potentially capped) extent. */
+ if (!shared) {
+ trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+ &imap);
+ goto done;
+ }
+
+ /*
+ * Fork all the shared blocks from our write offset until the
+ * end of the extent.
+ */
+ whichfork = XFS_COW_FORK;
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+ } else {
+ /*
+ * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+ * pages to keep the chunks of work done here somewhat
+ * symmetric with the work writeback does. This is a completely
+ * arbitrary number pulled out of thin air.
+ *
+ * Note that the value needs to be less than 32 bits wide until
+ * the lower level functions are updated.
+ */
+ count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
+ if (xfs_is_always_cow_inode(ip))
+ whichfork = XFS_COW_FORK;
}
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
- * to keep the chunks of work done where somewhat symmetric with the
- * work writeback does. This is a completely arbitrary number pulled
- * out of thin air as a best guess for initial testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- count = min_t(loff_t, count, 1024 * PAGE_SIZE);
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
if (eof) {
- prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
- &icur);
+ prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
+ count, &icur);
if (prealloc_blocks) {
xfs_extlen_t align;
xfs_off_t end_offset;
@@ -623,9 +682,11 @@ xfs_file_iomap_begin_delay(
}
retry:
- error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
- end_fsb - offset_fsb, prealloc_blocks, &got, &icur,
- eof);
+ error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
+ end_fsb - offset_fsb, prealloc_blocks,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap,
+ whichfork == XFS_DATA_FORK ? &icur : &ccur,
+ whichfork == XFS_DATA_FORK ? eof : cow_eof);
switch (error) {
case 0:
break;
@@ -647,186 +708,22 @@ retry:
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, count, 0, &got);
+ trace_xfs_iomap_alloc(ip, offset, count, whichfork,
+ whichfork == XFS_DATA_FORK ? &imap : &cmap);
done:
- if (isnullstartblock(got.br_startblock))
- got.br_startblock = DELAYSTARTBLOCK;
-
- if (!got.br_startblock) {
- error = xfs_alert_fsblock_zero(ip, &got);
- if (error)
+ if (whichfork == XFS_COW_FORK) {
+ if (imap.br_startoff > offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
goto out_unlock;
- }
-
- xfs_bmbt_to_iomap(ip, iomap, &got);
-
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
-}
-
-/*
- * Pass in a delayed allocate extent, convert it to real extents;
- * return to the caller the extent we create which maps on top of
- * the originating callers request.
- *
- * Called without a lock on the inode.
- *
- * We no longer bother to look at the incoming map - all we have to
- * guarantee is that whatever we allocate fills the required range.
- */
-int
-xfs_iomap_write_allocate(
- xfs_inode_t *ip,
- int whichfork,
- xfs_off_t offset,
- xfs_bmbt_irec_t *imap,
- unsigned int *cow_seq)
-{
- xfs_mount_t *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- xfs_fileoff_t offset_fsb, last_block;
- xfs_fileoff_t end_fsb, map_start_fsb;
- xfs_filblks_t count_fsb;
- xfs_trans_t *tp;
- int nimaps;
- int error = 0;
- int flags = XFS_BMAPI_DELALLOC;
- int nres;
-
- if (whichfork == XFS_COW_FORK)
- flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
-
- /*
- * Make sure that the dquots are there.
- */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = imap->br_blockcount;
- map_start_fsb = imap->br_startoff;
-
- XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
-
- while (count_fsb != 0) {
- /*
- * Set up a transaction with which to allocate the
- * backing store for the file. Do allocations in a
- * loop until we get some space in the range we are
- * interested in. The other space that might be allocated
- * is in the delayed allocation extent on which we sit
- * but before our buffer starts.
- */
- nimaps = 0;
- while (nimaps == 0) {
- nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- /*
- * We have already reserved space for the extent and any
- * indirect blocks when creating the delalloc extent,
- * there is no need to reserve space in this transaction
- * again.
- */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- /*
- * it is possible that the extents have changed since
- * we did the read call as we dropped the ilock for a
- * while. We have to be careful about truncates or hole
- * punchs here - we are not allowed to allocate
- * non-delalloc blocks here.
- *
- * The only protection against truncation is the pages
- * for the range we are being asked to convert are
- * locked and hence a truncate will block on them
- * first.
- *
- * As a result, if we go beyond the range we really
- * need and hit an delalloc extent boundary followed by
- * a hole while we have excess blocks in the map, we
- * will fill the hole incorrectly and overrun the
- * transaction reservation.
- *
- * Using a single map prevents this as we are forced to
- * check each map we look for overlap with the desired
- * range and abort as soon as we find it. Also, given
- * that we only return a single map, having one beyond
- * what we can return is probably a bit silly.
- *
- * We also need to check that we don't go beyond EOF;
- * this is a truncate optimisation as a truncate sets
- * the new file size before block on the pages we
- * currently have locked under writeback. Because they
- * are about to be tossed, we don't need to write them
- * back....
- */
- nimaps = 1;
- end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(ip, &last_block,
- XFS_DATA_FORK);
- if (error)
- goto trans_cancel;
-
- last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
- if ((map_start_fsb + count_fsb) > last_block) {
- count_fsb = last_block - map_start_fsb;
- if (count_fsb == 0) {
- error = -EAGAIN;
- goto trans_cancel;
- }
- }
-
- /*
- * From this point onwards we overwrite the imap
- * pointer that the caller gave to us.
- */
- error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, flags, nres, imap,
- &nimaps);
- if (error)
- goto trans_cancel;
-
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
-
- if (whichfork == XFS_COW_FORK)
- *cow_seq = READ_ONCE(ifp->if_seq);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
-
- /*
- * See if we were able to allocate an extent that
- * covers at least part of the callers request
- */
- if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
- return xfs_alert_fsblock_zero(ip, imap);
-
- if ((offset_fsb >= imap->br_startoff) &&
- (offset_fsb < (imap->br_startoff +
- imap->br_blockcount))) {
- XFS_STATS_INC(mp, xs_xstrat_quick);
- return 0;
}
-
- /*
- * So far we have not mapped the requested part of the
- * file, just surrounding data, try again.
- */
- count_fsb -= imap->br_blockcount;
- map_start_fsb = imap->br_startoff + imap->br_blockcount;
+ /* ensure we only report blocks we have a reservation for */
+ xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
+ shared = true;
}
-
-trans_cancel:
- xfs_trans_cancel(tp);
-error0:
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -879,7 +776,7 @@ xfs_iomap_write_unwritten(
* complete here and might deadlock on the iolock.
*/
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
@@ -975,7 +872,7 @@ xfs_ilock_for_iomap(
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && is_write) {
+ if (xfs_is_cow_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
@@ -1009,7 +906,7 @@ relock:
* check, so if we got ILOCK_SHARED for a write but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
- if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
+ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
@@ -1081,23 +978,33 @@ xfs_file_iomap_begin(
* Break shared extents if necessary. Checks for non-blocking IO have
* been done up front, so we don't need to do them here.
*/
- if (xfs_is_reflink_inode(ip)) {
+ if (xfs_is_cow_inode(ip)) {
+ struct xfs_bmbt_irec cmap;
+ bool directio = (flags & IOMAP_DIRECT);
+
/* if zeroing doesn't need COW allocation, then we are done. */
if ((flags & IOMAP_ZERO) &&
!needs_cow_for_zeroing(&imap, nimaps))
goto out_found;
- if (flags & IOMAP_DIRECT) {
- /* may drop and re-acquire the ilock */
- error = xfs_reflink_allocate_cow(ip, &imap, &shared,
- &lockmode);
- if (error)
- goto out_unlock;
- } else {
- error = xfs_reflink_reserve_cow(ip, &imap);
- if (error)
- goto out_unlock;
- }
+ /* may drop and re-acquire the ilock */
+ cmap = imap;
+ error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
+ directio);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For buffered writes we need to report the address of the
+ * previous block (if there was any) so that the higher level
+ * write code can perform read-modify-write operations; we
+ * won't need the CoW fork mapping until writeback. For direct
+ * I/O, which must be block aligned, we need to report the
+ * newly allocated address. If the data fork has a hole, copy
+ * the COW fork mapping to avoid allocating to the data fork.
+ */
+ if (directio || imap.br_startblock == HOLESTARTBLOCK)
+ imap = cmap;
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1139,23 +1046,15 @@ xfs_file_iomap_begin(
return error;
iomap->flags |= IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
- if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
- & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
-
- xfs_bmbt_to_iomap(ip, iomap, &imap);
-
- if (shared)
- iomap->flags |= IOMAP_F_SHARED;
- return 0;
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
ASSERT(nimaps);
xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
goto out_finish;
out_unlock:
@@ -1241,6 +1140,92 @@ const struct iomap_ops xfs_iomap_ops = {
};
static int
+xfs_seek_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) {
+ /*
+ * If we found a data extent we are done.
+ */
+ if (imap.br_startoff <= offset_fsb)
+ goto done;
+ data_fsb = imap.br_startoff;
+ } else {
+ /*
+ * Fake a hole until the end of the file.
+ */
+ data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
+ XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+ }
+
+ /*
+ * If a COW fork extent covers the hole, report it - capped to the next
+ * data fork extent:
+ */
+ if (xfs_inode_has_cow_data(ip) &&
+ xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cow_fsb = cmap.br_startoff;
+ if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+ * by this extent.
+ */
+ iomap->type = IOMAP_UNWRITTEN;
+ goto out_unlock;
+ }
+
+ /*
+ * Else report a hole, capped to the next found data or COW extent.
+ */
+ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb)
+ imap.br_blockcount = cow_fsb - offset_fsb;
+ else
+ imap.br_blockcount = data_fsb - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+done:
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_seek_iomap_ops = {
+ .iomap_begin = xfs_seek_iomap_begin,
+};
+
+static int
xfs_xattr_iomap_begin(
struct inode *inode,
loff_t offset,
@@ -1273,12 +1258,10 @@ xfs_xattr_iomap_begin(
out_unlock:
xfs_iunlock(ip, lockmode);
- if (!error) {
- ASSERT(nimaps);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- }
-
- return error;
+ if (error)
+ return error;
+ ASSERT(nimaps);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c6170548831b..5c2f6aa6d78f 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,12 +13,10 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
- struct xfs_bmbt_irec *, unsigned int *);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
-void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
- struct xfs_bmbt_irec *);
+int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
+ struct xfs_bmbt_irec *, bool shared);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
@@ -42,6 +40,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f48ffd7a8d3e..ff3c1fae5357 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -10,30 +10,20 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
-#include "xfs_trans_space.h"
#include "xfs_iomap.h"
-#include "xfs_defer.h"
-#include <linux/capability.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
-#include <linux/iomap.h>
-#include <linux/slab.h>
#include <linux/iversion.h>
/*
@@ -191,9 +181,18 @@ xfs_generic_create(
xfs_setup_iops(ip);
- if (tmpfile)
+ if (tmpfile) {
+ /*
+ * The VFS requires that any inode fed to d_tmpfile must have
+ * nlink == 1 so that it can decrement the nlink in d_tmpfile.
+ * However, we created the temp file with nlink == 0 because
+ * we're not allowed to put an inode with nlink > 0 on the
+ * unlinked list. Therefore we have to set nlink to 1 so that
+ * d_tmpfile can immediately set it back to zero.
+ */
+ set_nlink(inode, 1);
d_tmpfile(dentry, inode);
- else
+ } else
d_instantiate(dentry, inode);
xfs_finish_inode_setup(ip);
@@ -522,6 +521,10 @@ xfs_vn_getattr(
}
}
+ /*
+ * Note: If you add another clause to set an attribute flag, please
+ * update attributes_mask below.
+ */
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
stat->attributes |= STATX_ATTR_IMMUTABLE;
if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
@@ -529,6 +532,10 @@ xfs_vn_getattr(
if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP)
stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND |
+ STATX_ATTR_NODUMP);
+
switch (inode->i_mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 942e4aa5e729..a8a06bb78ea8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -14,45 +14,66 @@
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_health.h"
/*
- * Return stat information for one inode.
- * Return 0 if ok, else errno.
+ * Bulk Stat
+ * =========
+ *
+ * Use the inode walking functions to fill out struct xfs_bulkstat for every
+ * allocated inode, then pass the stat information to some externally provided
+ * iteration function.
*/
-int
+
+struct xfs_bstat_chunk {
+ bulkstat_one_fmt_pf formatter;
+ struct xfs_ibulk *breq;
+ struct xfs_bulkstat *buf;
+};
+
+/*
+ * Fill out the bulkstat info for a single inode and report it somewhere.
+ *
+ * bc->breq->startino is effectively the inode cursor as we walk through the
+ * filesystem. Therefore, we update it any time we need to move the cursor
+ * forward, regardless of whether or not we're sending any bstat information
+ * back to userspace. If the inode is internal metadata or has been freed
+ * out from under us, we just simply keep going.
+ *
+ * However, if any other type of error happens we want to stop right where we
+ * are so that userspace will call back with the exact number of the bad inode and
+ * we can send back an error code.
+ *
+ * Note that if the formatter tells us there's no space left in the buffer we
+ * move the cursor forward and abort the walk.
+ */
+STATIC int
xfs_bulkstat_one_int(
- struct xfs_mount *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
- int *ubused, /* bytes used by me */
- int *stat) /* BULKSTAT_RV_... */
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ struct xfs_bstat_chunk *bc)
{
struct xfs_icdinode *dic; /* dinode core info pointer */
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
- struct xfs_bstat *buf; /* return buffer */
- int error = 0; /* error value */
+ struct xfs_bulkstat *buf = bc->buf;
+ int error = -EINVAL;
- *stat = BULKSTAT_RV_NOTHING;
+ if (xfs_internal_inum(mp, ino))
+ goto out_advance;
- if (!buffer || xfs_internal_inum(mp, ino))
- return -EINVAL;
-
- buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
- if (!buf)
- return -ENOMEM;
-
- error = xfs_iget(mp, NULL, ino,
+ error = xfs_iget(mp, tp, ino,
(XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
XFS_ILOCK_SHARED, &ip);
+ if (error == -ENOENT || error == -EINVAL)
+ goto out_advance;
if (error)
- goto out_free;
+ goto out;
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
@@ -63,36 +84,35 @@ xfs_bulkstat_one_int(
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_projid_lo = dic->di_projid_lo;
- buf->bs_projid_hi = dic->di_projid_hi;
+ buf->bs_projectid = xfs_get_projid(ip);
buf->bs_ino = ino;
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
buf->bs_nlink = inode->i_nlink;
- buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
- buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
- buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
- buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
- buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_atime = inode->i_atime.tv_sec;
+ buf->bs_atime_nsec = inode->i_atime.tv_nsec;
+ buf->bs_mtime = inode->i_mtime.tv_sec;
+ buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
+ buf->bs_ctime = inode->i_ctime.tv_sec;
+ buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_btime = dic->di_crtime.t_sec;
+ buf->bs_btime_nsec = dic->di_crtime.t_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
buf->bs_xflags = xfs_ip2xflags(ip);
- buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
+ buf->bs_extsize_blks = dic->di_extsize;
buf->bs_extents = dic->di_nextents;
- memset(buf->bs_pad, 0, sizeof(buf->bs_pad));
- buf->bs_dmevmask = dic->di_dmevmask;
- buf->bs_dmstate = dic->di_dmstate;
+ xfs_bulkstat_health(ip, buf);
buf->bs_aextents = dic->di_anextents;
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
+ buf->bs_version = XFS_BULKSTAT_VERSION_V5;
if (dic->di_version == 3) {
if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
- buf->bs_cowextsize = dic->di_cowextsize <<
- mp->m_sb.sb_blocklog;
+ buf->bs_cowextsize_blks = dic->di_cowextsize;
}
switch (dic->di_format) {
@@ -116,385 +136,121 @@ xfs_bulkstat_one_int(
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_irele(ip);
- error = formatter(buffer, ubsize, ubused, buf);
- if (!error)
- *stat = BULKSTAT_RV_DIDONE;
+ error = bc->formatter(bc->breq, buf);
+ if (error == XFS_IBULK_ABORT)
+ goto out_advance;
+ if (error)
+ goto out;
- out_free:
- kmem_free(buf);
+out_advance:
+ /*
+ * Advance the cursor to the inode that comes after the one we just
+ * looked at. We want the caller to move along if the bulkstat
+ * information was copied successfully; if we tried to grab the inode
+ * but it's no longer allocated; or if it's internal metadata.
+ */
+ bc->breq->startino = ino + 1;
+out:
return error;
}
-/* Return 0 on success or positive error */
-STATIC int
-xfs_bulkstat_one_fmt(
- void __user *ubuffer,
- int ubsize,
- int *ubused,
- const xfs_bstat_t *buffer)
-{
- if (ubsize < sizeof(*buffer))
- return -ENOMEM;
- if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
- return -EFAULT;
- if (ubused)
- *ubused = sizeof(*buffer);
- return 0;
-}
-
+/* Bulkstat a single inode. */
int
xfs_bulkstat_one(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* buffer to place output in */
- int ubsize, /* size of buffer */
- int *ubused, /* bytes used by me */
- int *stat) /* BULKSTAT_RV_... */
+ struct xfs_ibulk *breq,
+ bulkstat_one_fmt_pf formatter)
{
- return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
- xfs_bulkstat_one_fmt, ubused, stat);
-}
+ struct xfs_bstat_chunk bc = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ int error;
-/*
- * Loop over all clusters in a chunk for a given incore inode allocation btree
- * record. Do a readahead if there are any allocated inodes in that cluster.
- */
-STATIC void
-xfs_bulkstat_ichunk_ra(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- struct xfs_inobt_rec_incore *irec)
-{
- xfs_agblock_t agbno;
- struct blk_plug plug;
- int i; /* inode chunk index */
-
- agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
-
- blk_start_plug(&plug);
- for (i = 0; i < XFS_INODES_PER_CHUNK;
- i += mp->m_inodes_per_cluster, agbno += mp->m_blocks_per_cluster) {
- if (xfs_inobt_maskn(i, mp->m_inodes_per_cluster) &
- ~irec->ir_free) {
- xfs_btree_reada_bufs(mp, agno, agbno,
- mp->m_blocks_per_cluster,
- &xfs_inode_buf_ops);
- }
- }
- blk_finish_plug(&plug);
-}
+ ASSERT(breq->icount == 1);
-/*
- * Lookup the inode chunk that the given inode lives in and then get the record
- * if we found the chunk. If the inode was not the last in the chunk and there
- * are some left allocated, update the data for the pointed-to record as well as
- * return the count of grabbed inodes.
- */
-STATIC int
-xfs_bulkstat_grab_ichunk(
- struct xfs_btree_cur *cur, /* btree cursor */
- xfs_agino_t agino, /* starting inode of chunk */
- int *icount,/* return # of inodes grabbed */
- struct xfs_inobt_rec_incore *irec) /* btree record */
-{
- int idx; /* index into inode chunk */
- int stat;
- int error = 0;
-
- /* Lookup the inode chunk that this inode lives in */
- error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
- if (error)
- return error;
- if (!stat) {
- *icount = 0;
- return error;
- }
+ bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+ KM_SLEEP | KM_MAYFAIL);
+ if (!bc.buf)
+ return -ENOMEM;
- /* Get the record, should always work */
- error = xfs_inobt_get_rec(cur, irec, &stat);
- if (error)
- return error;
- XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
+ error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc);
- /* Check if the record contains the inode in request */
- if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
- *icount = 0;
- return 0;
- }
+ kmem_free(bc.buf);
- idx = agino - irec->ir_startino + 1;
- if (idx < XFS_INODES_PER_CHUNK &&
- (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
- int i;
-
- /* We got a right chunk with some left inodes allocated at it.
- * Grab the chunk record. Mark all the uninteresting inodes
- * free -- because they're before our start point.
- */
- for (i = 0; i < idx; i++) {
- if (XFS_INOBT_MASK(i) & ~irec->ir_free)
- irec->ir_freecount++;
- }
-
- irec->ir_free |= xfs_inobt_maskn(0, idx);
- *icount = irec->ir_count - irec->ir_freecount;
- }
+ /*
+ * If we reported one inode to userspace then we abort because we hit
+ * the end of the buffer. Don't leak that back to userspace.
+ */
+ if (error == XFS_IWALK_ABORT)
+ error = 0;
- return 0;
+ return error;
}
-#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
-
-struct xfs_bulkstat_agichunk {
- char __user **ac_ubuffer;/* pointer into user's buffer */
- int ac_ubleft; /* bytes left in user's buffer */
- int ac_ubelem; /* spaces used in user's buffer */
-};
-
-/*
- * Process inodes in chunk with a pointer to a formatter function
- * that will iget the inode and fill in the appropriate structure.
- */
static int
-xfs_bulkstat_ag_ichunk(
- struct xfs_mount *mp,
- xfs_agnumber_t agno,
- struct xfs_inobt_rec_incore *irbp,
- bulkstat_one_pf formatter,
- size_t statstruct_size,
- struct xfs_bulkstat_agichunk *acp,
- xfs_agino_t *last_agino)
+xfs_bulkstat_iwalk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ void *data)
{
- char __user **ubufp = acp->ac_ubuffer;
- int chunkidx;
- int error = 0;
- xfs_agino_t agino = irbp->ir_startino;
-
- for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
- chunkidx++, agino++) {
- int fmterror;
- int ubused;
-
- /* inode won't fit in buffer, we are done */
- if (acp->ac_ubleft < statstruct_size)
- break;
-
- /* Skip if this inode is free */
- if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
- continue;
-
- /* Get the inode and fill in a single buffer */
- ubused = statstruct_size;
- error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
- *ubufp, acp->ac_ubleft, &ubused, &fmterror);
-
- if (fmterror == BULKSTAT_RV_GIVEUP ||
- (error && error != -ENOENT && error != -EINVAL)) {
- acp->ac_ubleft = 0;
- ASSERT(error);
- break;
- }
-
- /* be careful not to leak error if at end of chunk */
- if (fmterror == BULKSTAT_RV_NOTHING || error) {
- error = 0;
- continue;
- }
-
- *ubufp += ubused;
- acp->ac_ubleft -= ubused;
- acp->ac_ubelem++;
- }
-
- /*
- * Post-update *last_agino. At this point, agino will always point one
- * inode past the last inode we processed successfully. Hence we
- * substract that inode when setting the *last_agino cursor so that we
- * return the correct cookie to userspace. On the next bulkstat call,
- * the inode under the lastino cookie will be skipped as we have already
- * processed it here.
- */
- *last_agino = agino - 1;
+ int error;
+ error = xfs_bulkstat_one_int(mp, tp, ino, data);
+ /* bulkstat just skips over missing inodes */
+ if (error == -ENOENT || error == -EINVAL)
+ return 0;
return error;
}
/*
- * Return stat information in bulk (by-inode) for the filesystem.
+ * Check the incoming startino parameter.
+ *
+ * We allow any inode value that could map to physical space inside the
+ * filesystem because if there are no inodes there, bulkstat moves on to the
+ * next chunk. In other words, the magic agino value of zero takes us to the
+ * first chunk in the AG, and an agino value past the end of the AG takes us to
+ * the first chunk in the next AG.
+ *
+ * Therefore we can end early if the requested inode is beyond the end of the
+ * filesystem or doesn't map properly.
*/
-int /* error status */
-xfs_bulkstat(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *lastinop, /* last inode returned */
- int *ubcountp, /* size of buffer/count returned */
- bulkstat_one_pf formatter, /* func that'd fill a single buf */
- size_t statstruct_size, /* sizeof struct filling */
- char __user *ubuffer, /* buffer with inode stats */
- int *done) /* 1 if there are more stats to get */
+static inline bool
+xfs_bulkstat_already_done(
+ struct xfs_mount *mp,
+ xfs_ino_t startino)
{
- xfs_buf_t *agbp; /* agi header buffer */
- xfs_agino_t agino; /* inode # in allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
- xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
- int nirbuf; /* size of irbuf */
- int ubcount; /* size of user's buffer */
- struct xfs_bulkstat_agichunk ac;
- int error = 0;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, startino);
- /*
- * Get the last inode value, see if there's nothing to do.
- */
- agno = XFS_INO_TO_AGNO(mp, *lastinop);
- agino = XFS_INO_TO_AGINO(mp, *lastinop);
- if (agno >= mp->m_sb.sb_agcount ||
- *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
- *done = 1;
- *ubcountp = 0;
- return 0;
- }
+ return agno >= mp->m_sb.sb_agcount ||
+ startino != XFS_AGINO_TO_INO(mp, agno, agino);
+}
- ubcount = *ubcountp; /* statstruct's */
- ac.ac_ubuffer = &ubuffer;
- ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
- ac.ac_ubelem = 0;
+/* Return stat information in bulk (by-inode) for the filesystem. */
+int
+xfs_bulkstat(
+ struct xfs_ibulk *breq,
+ bulkstat_one_fmt_pf formatter)
+{
+ struct xfs_bstat_chunk bc = {
+ .formatter = formatter,
+ .breq = breq,
+ };
+ int error;
- *ubcountp = 0;
- *done = 0;
+ if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+ return 0;
- irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP);
- if (!irbuf)
+ bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
+ KM_SLEEP | KM_MAYFAIL);
+ if (!bc.buf)
return -ENOMEM;
- nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf);
- /*
- * Loop over the allocation groups, starting from the last
- * inode returned; 0 means start of the allocation group.
- */
- while (agno < mp->m_sb.sb_agcount) {
- struct xfs_inobt_rec_incore *irbp = irbuf;
- struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf;
- bool end_of_ag = false;
- int icount = 0;
- int stat;
-
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
- if (error)
- break;
- /*
- * Allocate and initialize a btree cursor for ialloc btree.
- */
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
- XFS_BTNUM_INO);
- if (agino > 0) {
- /*
- * In the middle of an allocation group, we need to get
- * the remainder of the chunk we're in.
- */
- struct xfs_inobt_rec_incore r;
-
- error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
- if (error)
- goto del_cursor;
- if (icount) {
- irbp->ir_startino = r.ir_startino;
- irbp->ir_holemask = r.ir_holemask;
- irbp->ir_count = r.ir_count;
- irbp->ir_freecount = r.ir_freecount;
- irbp->ir_free = r.ir_free;
- irbp++;
- }
- /* Increment to the next record */
- error = xfs_btree_increment(cur, 0, &stat);
- } else {
- /* Start of ag. Lookup the first inode chunk */
- error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
- }
- if (error || stat == 0) {
- end_of_ag = true;
- goto del_cursor;
- }
-
- /*
- * Loop through inode btree records in this ag,
- * until we run out of inodes or space in the buffer.
- */
- while (irbp < irbufend && icount < ubcount) {
- struct xfs_inobt_rec_incore r;
-
- error = xfs_inobt_get_rec(cur, &r, &stat);
- if (error || stat == 0) {
- end_of_ag = true;
- goto del_cursor;
- }
-
- /*
- * If this chunk has any allocated inodes, save it.
- * Also start read-ahead now for this chunk.
- */
- if (r.ir_freecount < r.ir_count) {
- xfs_bulkstat_ichunk_ra(mp, agno, &r);
- irbp->ir_startino = r.ir_startino;
- irbp->ir_holemask = r.ir_holemask;
- irbp->ir_count = r.ir_count;
- irbp->ir_freecount = r.ir_freecount;
- irbp->ir_free = r.ir_free;
- irbp++;
- icount += r.ir_count - r.ir_freecount;
- }
- error = xfs_btree_increment(cur, 0, &stat);
- if (error || stat == 0) {
- end_of_ag = true;
- goto del_cursor;
- }
- cond_resched();
- }
-
- /*
- * Drop the btree buffers and the agi buffer as we can't hold any
- * of the locks these represent when calling iget. If there is a
- * pending error, then we are done.
- */
-del_cursor:
- xfs_btree_del_cursor(cur, error);
- xfs_buf_relse(agbp);
- if (error)
- break;
- /*
- * Now format all the good inodes into the user's buffer. The
- * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
- * for the next loop iteration.
- */
- irbufend = irbp;
- for (irbp = irbuf;
- irbp < irbufend && ac.ac_ubleft >= statstruct_size;
- irbp++) {
- error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
- formatter, statstruct_size, &ac,
- &agino);
- if (error)
- break;
-
- cond_resched();
- }
-
- /*
- * If we've run out of space or had a formatting error, we
- * are now done
- */
- if (ac.ac_ubleft < statstruct_size || error)
- break;
-
- if (end_of_ag) {
- agno++;
- agino = 0;
- }
- }
- /*
- * Done, we're either out of filesystem or space to put the data.
- */
- kmem_free(irbuf);
- *ubcountp = ac.ac_ubelem;
+ error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags,
+ xfs_bulkstat_iwalk, breq->icount, &bc);
+
+ kmem_free(bc.buf);
/*
* We found some inodes, so clear the error status and return them.
@@ -503,135 +259,136 @@ del_cursor:
* triggered again and propagated to userspace as there will be no
* formatted inodes in the buffer.
*/
- if (ac.ac_ubelem)
+ if (breq->ocount > 0)
error = 0;
- /*
- * If we ran out of filesystem, lastino will point off the end of
- * the filesystem so the next call will return immediately.
- */
- *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
- if (agno >= mp->m_sb.sb_agcount)
- *done = 1;
-
return error;
}
-int
-xfs_inumbers_fmt(
- void __user *ubuffer, /* buffer to write to */
- const struct xfs_inogrp *buffer, /* buffer to read from */
- long count, /* # of elements to read */
- long *written) /* # of bytes written */
+/*
+ * Convert bulkstat (v5) to bstat (v1).
+ *
+ * @bs1 is zeroed and then filled field by field from @bstat.  Seconds and
+ * nanoseconds of each v5 timestamp land in the matching v1 tv_sec/tv_nsec
+ * pair.  The two *_blks extent size hints are passed through XFS_FSB_TO_B()
+ * with @mp (presumably filesystem blocks to bytes -- confirm against the
+ * macro).  The project id is split into its low 16 bits and the bits above
+ * them.  The two DMAPI fields are always reported as zero.
+ */
+void
+xfs_bulkstat_to_bstat(
+ struct xfs_mount *mp,
+ struct xfs_bstat *bs1,
+ const struct xfs_bulkstat *bstat)
{
- if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer)))
- return -EFAULT;
- *written = count * sizeof(*buffer);
- return 0;
+ memset(bs1, 0, sizeof(struct xfs_bstat)); /* anything not assigned below reads as zero */
+ bs1->bs_ino = bstat->bs_ino;
+ bs1->bs_mode = bstat->bs_mode;
+ bs1->bs_nlink = bstat->bs_nlink;
+ bs1->bs_uid = bstat->bs_uid;
+ bs1->bs_gid = bstat->bs_gid;
+ bs1->bs_rdev = bstat->bs_rdev;
+ bs1->bs_blksize = bstat->bs_blksize;
+ bs1->bs_size = bstat->bs_size;
+ bs1->bs_atime.tv_sec = bstat->bs_atime;
+ bs1->bs_mtime.tv_sec = bstat->bs_mtime;
+ bs1->bs_ctime.tv_sec = bstat->bs_ctime;
+ bs1->bs_atime.tv_nsec = bstat->bs_atime_nsec;
+ bs1->bs_mtime.tv_nsec = bstat->bs_mtime_nsec;
+ bs1->bs_ctime.tv_nsec = bstat->bs_ctime_nsec;
+ bs1->bs_blocks = bstat->bs_blocks;
+ bs1->bs_xflags = bstat->bs_xflags;
+ bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks);
+ bs1->bs_extents = bstat->bs_extents;
+ bs1->bs_gen = bstat->bs_gen;
+ bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF; /* low 16 bits */
+ bs1->bs_forkoff = bstat->bs_forkoff;
+ bs1->bs_projid_hi = bstat->bs_projectid >> 16; /* remaining high bits */
+ bs1->bs_sick = bstat->bs_sick;
+ bs1->bs_checked = bstat->bs_checked;
+ bs1->bs_cowextsize = XFS_FSB_TO_B(mp, bstat->bs_cowextsize_blks);
+ bs1->bs_dmevmask = 0; /* already zero from the memset; kept explicit */
+ bs1->bs_dmstate = 0; /* likewise */
+ bs1->bs_aextents = bstat->bs_aextents;
+}
+
+struct xfs_inumbers_chunk { /* per-call state that xfs_inumbers hands to xfs_inumbers_walk as its data pointer */
+ inumbers_fmt_pf formatter; /* called by xfs_inumbers_walk once per inobt record */
+ struct xfs_ibulk *breq; /* request being served; passed to formatter, startino updated by the walk */
+};
+
+/*
+ * INUMBERS
+ * ========
+ * This is how we export inode btree records to userspace, so that XFS tools
+ * can figure out where inodes are allocated.
+ */
+
+/*
+ * Inobt walk callback for INUMBERS: describe one inode btree record as a
+ * struct xfs_inumbers and hand it to the caller-supplied formatter.
+ *
+ * breq->startino is the walk cursor.  It is moved to the first inode past
+ * this chunk when the formatter succeeds, and also when the formatter
+ * returns XFS_IBULK_ABORT to say the walk should stop (the output buffer is
+ * full).  Any other formatter error is returned with the cursor left where
+ * it was.  The formatter's return value is always passed back to the walker.
+ */
+STATIC int
+xfs_inumbers_walk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_agnumber_t		agno,
+	const struct xfs_inobt_rec_incore *irec,
+	void			*data)
+{
+	struct xfs_inumbers_chunk *ic = data;
+	struct xfs_inumbers	inogrp = {
+		.xi_startino	= XFS_AGINO_TO_INO(mp, agno, irec->ir_startino),
+		.xi_alloccount	= irec->ir_count - irec->ir_freecount,
+		.xi_allocmask	= ~irec->ir_free,
+		.xi_version	= XFS_INUMBERS_VERSION_V5,
+	};
+	int			ret;
+
+	ret = ic->formatter(ic->breq, &inogrp);
+
+	/* Only a clean result or a "buffer full" abort advances the cursor. */
+	if (ret == 0 || ret == XFS_IBULK_ABORT)
+		ic->breq->startino =
+			XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) +
+			XFS_INODES_PER_CHUNK;
+
+	return ret;
}
/*
* Return inode number table for the filesystem.
*/
-int /* error status */
+int
xfs_inumbers(
- struct xfs_mount *mp,/* mount point for filesystem */
- xfs_ino_t *lastino,/* last inode returned */
- int *count,/* size of buffer/count returned */
- void __user *ubuffer,/* buffer with inode descriptions */
+ struct xfs_ibulk *breq,
inumbers_fmt_pf formatter)
{
- xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino);
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino);
- struct xfs_btree_cur *cur = NULL;
- struct xfs_buf *agbp = NULL;
- struct xfs_inogrp *buffer;
- int bcount;
- int left = *count;
- int bufidx = 0;
+ struct xfs_inumbers_chunk ic = {
+ .formatter = formatter,
+ .breq = breq,
+ };
int error = 0;
- *count = 0;
- if (agno >= mp->m_sb.sb_agcount ||
- *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
- return error;
+ if (xfs_bulkstat_already_done(breq->mp, breq->startino))
+ return 0;
- bcount = min(left, (int)(PAGE_SIZE / sizeof(*buffer)));
- buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP);
- do {
- struct xfs_inobt_rec_incore r;
- int stat;
-
- if (!agbp) {
- error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
- if (error)
- break;
-
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
- XFS_BTNUM_INO);
- error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
- &stat);
- if (error)
- break;
- if (!stat)
- goto next_ag;
- }
-
- error = xfs_inobt_get_rec(cur, &r, &stat);
- if (error)
- break;
- if (!stat)
- goto next_ag;
-
- agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
- buffer[bufidx].xi_startino =
- XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
- buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount;
- buffer[bufidx].xi_allocmask = ~r.ir_free;
- if (++bufidx == bcount) {
- long written;
-
- error = formatter(ubuffer, buffer, bufidx, &written);
- if (error)
- break;
- ubuffer += written;
- *count += bufidx;
- bufidx = 0;
- }
- if (!--left)
- break;
-
- error = xfs_btree_increment(cur, 0, &stat);
- if (error)
- break;
- if (stat)
- continue;
-
-next_ag:
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- cur = NULL;
- xfs_buf_relse(agbp);
- agbp = NULL;
- agino = 0;
- agno++;
- } while (agno < mp->m_sb.sb_agcount);
-
- if (!error) {
- if (bufidx) {
- long written;
-
- error = formatter(ubuffer, buffer, bufidx, &written);
- if (!error)
- *count += bufidx;
- }
- *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
- }
+ error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags,
+ xfs_inumbers_walk, breq->icount, &ic);
- kmem_free(buffer);
- if (cur)
- xfs_btree_del_cursor(cur, error);
- if (agbp)
- xfs_buf_relse(agbp);
+ /*
+ * We found some inode groups, so clear the error status and return
+ * them. The breq->startino cursor is not advanced past the inode
+ * group that triggered any error that occurred, so on the next call
+ * the error will be triggered again and propagated to userspace as
+ * there will be no formatted inode groups in the buffer.
+ */
+ if (breq->ocount > 0)
+ error = 0;
return error;
}
+
+/*
+ * Convert an inumbers (v5) struct to an inogrp (v1) struct.
+ *
+ * @ig1 is zeroed in full before the three fields are copied, the same way
+ * xfs_bulkstat_to_bstat prepares its v1 struct.  Without this, any padding
+ * or otherwise unassigned bytes in @ig1 would keep whatever the caller's
+ * memory held, and callers presumably copy @ig1 out to userspace.
+ */
+void
+xfs_inumbers_to_inogrp(
+	struct xfs_inogrp		*ig1,
+	const struct xfs_inumbers	*ig)
+{
+	memset(ig1, 0, sizeof(struct xfs_inogrp));
+	ig1->xi_startino = ig->xi_startino;
+	ig1->xi_alloccount = ig->xi_alloccount;
+	ig1->xi_allocmask = ig->xi_allocmask;
+}
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 8a822285b671..e90c1fc5b981 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -5,83 +5,55 @@
#ifndef __XFS_ITABLE_H__
#define __XFS_ITABLE_H__
-/*
- * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat
- * structures (by the dmi library). This is a pointer to a formatter function
- * that will iget the inode and fill in the appropriate structure.
- * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c
- */
-typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
- xfs_ino_t ino,
- void __user *buffer,
- int ubsize,
- int *ubused,
- int *stat);
+/* In-memory representation of a userspace request for batch inode data. */
+struct xfs_ibulk {
+ struct xfs_mount *mp; /* filesystem the request is issued against */
+ void __user *ubuffer; /* user output buffer; moved forward by xfs_ibulk_advance */
+ xfs_ino_t startino; /* start with this inode; also updated as the walk cursor */
+ unsigned int icount; /* number of elements in ubuffer */
+ unsigned int ocount; /* number of records returned; bumped by xfs_ibulk_advance */
+ unsigned int flags; /* see XFS_IBULK_* flags below, e.g. XFS_IBULK_SAME_AG */
+};
+
+/* Only iterate within the same AG as startino */
+#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
+
+/* Return value that means we want to abort the walk. */
+#define XFS_IBULK_ABORT (XFS_IWALK_ABORT)
/*
- * Values for stat return value.
+ * Advance the user buffer pointer by one record of the given size. If the
+ * buffer is now full, return the appropriate error code.
*/
-#define BULKSTAT_RV_NOTHING 0
-#define BULKSTAT_RV_DIDONE 1
-#define BULKSTAT_RV_GIVEUP 2
+static inline int
+xfs_ibulk_advance(
+	struct xfs_ibulk	*breq,
+	size_t			bytes)
+{
+	/* Step the user pointer in bytes, whatever the record type is. */
+	breq->ubuffer = (char __user *)breq->ubuffer + bytes;
+
+	/* One more record is out; tell the walker to stop once we hit icount. */
+	if (++breq->ocount == breq->icount)
+		return XFS_IBULK_ABORT;
+	return 0;
+}
/*
* Return stat information in bulk (by-inode) for the filesystem.
*/
-int /* error status */
-xfs_bulkstat(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *lastino, /* last inode returned */
- int *count, /* size of buffer/count returned */
- bulkstat_one_pf formatter, /* func that'd fill a single buf */
- size_t statstruct_size,/* sizeof struct that we're filling */
- char __user *ubuffer,/* buffer with inode stats */
- int *done); /* 1 if there are more stats to get */
-
-typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
- void __user *ubuffer, /* buffer to write to */
- int ubsize, /* remaining user buffer sz */
- int *ubused, /* bytes used by formatter */
- const xfs_bstat_t *buffer); /* buffer to read from */
-
-int
-xfs_bulkstat_one_int(
- xfs_mount_t *mp,
- xfs_ino_t ino,
- void __user *buffer,
- int ubsize,
- bulkstat_one_fmt_pf formatter,
- int *ubused,
- int *stat);
-int
-xfs_bulkstat_one(
- xfs_mount_t *mp,
- xfs_ino_t ino,
- void __user *buffer,
- int ubsize,
- int *ubused,
- int *stat);
+typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq,
+ const struct xfs_bulkstat *bstat);
-typedef int (*inumbers_fmt_pf)(
- void __user *ubuffer, /* buffer to write to */
- const xfs_inogrp_t *buffer, /* buffer to read from */
- long count, /* # of elements to read */
- long *written); /* # of bytes written */
+int xfs_bulkstat_one(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+int xfs_bulkstat(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter);
+void xfs_bulkstat_to_bstat(struct xfs_mount *mp, struct xfs_bstat *bs1,
+ const struct xfs_bulkstat *bstat);
-int
-xfs_inumbers_fmt(
- void __user *ubuffer, /* buffer to write to */
- const xfs_inogrp_t *buffer, /* buffer to read from */
- long count, /* # of elements to read */
- long *written); /* # of bytes written */
+typedef int (*inumbers_fmt_pf)(struct xfs_ibulk *breq,
+ const struct xfs_inumbers *igrp);
-int /* error status */
-xfs_inumbers(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t *last, /* last inode returned */
- int *count, /* size of buffer/count returned */
- void __user *buffer, /* buffer with inode info */
- inumbers_fmt_pf formatter);
+int xfs_inumbers(struct xfs_ibulk *breq, inumbers_fmt_pf formatter);
+void xfs_inumbers_to_inogrp(struct xfs_inogrp *ig1,
+ const struct xfs_inumbers *ig);
#endif /* __XFS_ITABLE_H__ */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
new file mode 100644
index 000000000000..8c7d727149ea
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.c
@@ -0,0 +1,720 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+#include "xfs_pwork.h"
+
+/*
+ * Walking Inodes in the Filesystem
+ * ================================
+ *
+ * This iterator function walks a subset of filesystem inodes in increasing
+ * order from @startino until there are no more inodes. For each allocated
+ * inode it finds, it calls a walk function with the relevant inode number and
+ * a pointer to caller-provided data. The walk function can return the usual
+ * negative error code to stop the iteration; 0 to continue the iteration; or
+ * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the
+ * caller.
+ *
+ * Internally, we allow the walk function to do anything, which means that we
+ * cannot maintain the inobt cursor or our lock on the AGI buffer. We
+ * therefore cache the inobt records in kernel memory and only call the walk
+ * function when our memory buffer is full. @nr_recs is the number of records
+ * that we've cached, and @sz_recs is the size of our cache.
+ *
+ * It is the responsibility of the walk function to ensure it accesses
+ * allocated inodes, as the inobt records may be stale by the time they are
+ * acted upon.
+ */
+
+struct xfs_iwalk_ag {
+ /*
+ * Parallel work control data, polled via xfs_pwork_want_abort() during
+ * the walk.  NOTE(review): the original comment said this "will be null
+ * if single threaded", but it is an embedded struct, so that presumably
+ * means zero-initialized -- confirm.
+ */
+ struct xfs_pwork pwork;
+
+ struct xfs_mount *mp; /* filesystem being walked */
+ struct xfs_trans *tp; /* passed through to btree/buffer calls and callbacks */
+
+ /* Where do we start the traversal? */
+ xfs_ino_t startino;
+
+ /* Array of inobt records we cache. */
+ struct xfs_inobt_rec_incore *recs;
+
+ /* Number of entries allocated for the @recs array. */
+ unsigned int sz_recs;
+
+ /* Number of entries in the @recs array that are in use. */
+ unsigned int nr_recs;
+
+ /*
+ * Walk functions and data pointer.  xfs_iwalk_ag_recs calls
+ * @inobt_walk_fn (if set) once per cached record and @iwalk_fn (if
+ * set) once per inode not marked free in the record; @data is passed
+ * to both.
+ */
+ xfs_iwalk_fn iwalk_fn;
+ xfs_inobt_walk_fn inobt_walk_fn;
+ void *data;
+
+ /*
+ * Make it look like the inodes up to startino are free so that
+ * bulkstat can start its inode iteration at the correct place without
+ * needing to special case everywhere.
+ */
+ unsigned int trim_start:1;
+
+ /*
+ * Skip empty inobt records?  When set, records whose ir_freecount
+ * equals ir_count are not cached.
+ */
+ unsigned int skip_empty:1;
+};
+
+/*
+ * Issue readahead for the inode clusters of one inobt record.  The chunk is
+ * visited one cluster at a time and a cluster is only read ahead if at least
+ * one of its inodes is not marked free in @irec.  All requests are issued
+ * under a single block plug.
+ */
+STATIC void
+xfs_iwalk_ichunk_ra(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
+	struct blk_plug			plug;
+	xfs_agblock_t			cluster_bno;
+	int				first;	/* first inode index of the cluster */
+
+	cluster_bno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+
+	blk_start_plug(&plug);
+	for (first = 0;
+	     first < XFS_INODES_PER_CHUNK;
+	     first += igeo->inodes_per_cluster,
+	     cluster_bno += igeo->blocks_per_cluster) {
+		xfs_inofree_t		imask;
+
+		/* Nothing allocated in this cluster?  Then don't read it. */
+		imask = xfs_inobt_maskn(first, igeo->inodes_per_cluster);
+		if (!(imask & ~irec->ir_free))
+			continue;
+
+		xfs_btree_reada_bufs(mp, agno, cluster_bno,
+				igeo->blocks_per_cluster,
+				&xfs_inode_buf_ops);
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Restart helper for a walk that begins part-way through an inode chunk.
+ * Rewrite @irec so that every inode in front of @agino looks free, which
+ * makes the per-inode loop skip them.  ir_freecount is raised once for each
+ * of those inodes that was not already marked free, keeping it in step with
+ * the ir_free mask.
+ */
+STATIC void
+xfs_iwalk_adjust_start(
+	xfs_agino_t			agino,	/* inode the walk starts at */
+	struct xfs_inobt_rec_incore	*irec)	/* btree record */
+{
+	int				nskip;	/* inodes in front of @agino */
+	int				bit;
+
+	nskip = agino - irec->ir_startino;
+
+	/* Account for the allocated inodes that are about to be hidden... */
+	for (bit = 0; bit < nskip; bit++) {
+		if (~irec->ir_free & XFS_INOBT_MASK(bit))
+			irec->ir_freecount++;
+	}
+
+	/* ...then mark the whole leading range free in one go. */
+	irec->ir_free |= xfs_inobt_maskn(0, nskip);
+}
+
+/*
+ * Allocate the inobt record cache for a walk.  @iwag->sz_recs must already
+ * hold the wanted number of entries and @iwag->recs must be NULL.  The cache
+ * starts out empty.  Returns 0, or -ENOMEM if the allocation fails.
+ */
+STATIC int
+xfs_iwalk_alloc(
+	struct xfs_iwalk_ag	*iwag)
+{
+	ASSERT(iwag->recs == NULL);
+	iwag->nr_recs = 0;
+
+	/* The allocation is allowed to fail; the caller sees -ENOMEM. */
+	iwag->recs = kmem_alloc(
+			iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore),
+			KM_MAYFAIL);
+
+	return iwag->recs ? 0 : -ENOMEM;
+}
+
+/*
+ * Free memory we allocated for a walk, and clear the pointer so that a later
+ * xfs_iwalk_alloc() on the same @iwag passes its recs == NULL assertion.
+ */
+STATIC void
+xfs_iwalk_free(
+ struct xfs_iwalk_ag *iwag)
+{
+ kmem_free(iwag->recs);
+ iwag->recs = NULL;
+}
+
+/*
+ * For each inuse inode in each cached inobt record, call our function.
+ *
+ * Every cached record is first traced and, if @iwag->inobt_walk_fn is set,
+ * passed to it.  Then, if @iwag->iwalk_fn is set, it is called for each
+ * inode of the record that is not marked free.  The AG number comes from
+ * @iwag->startino.
+ *
+ * Returns the first nonzero value a callback returns, or 0 once all records
+ * were processed.  Also returns 0 early if xfs_pwork_want_abort() reports
+ * that the walk should stop.
+ */
+STATIC int
+xfs_iwalk_ag_recs(
+ struct xfs_iwalk_ag *iwag)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ xfs_ino_t ino;
+ unsigned int i, j;
+ xfs_agnumber_t agno;
+ int error;
+
+ agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+ for (i = 0; i < iwag->nr_recs; i++) {
+ struct xfs_inobt_rec_incore *irec = &iwag->recs[i];
+
+ trace_xfs_iwalk_ag_rec(mp, agno, irec);
+
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ return 0; /* abort request is not reported as an error here */
+
+ if (iwag->inobt_walk_fn) {
+ error = iwag->inobt_walk_fn(mp, tp, agno, irec,
+ iwag->data);
+ if (error)
+ return error; /* any nonzero return stops the walk */
+ }
+
+ if (!iwag->iwalk_fn)
+ continue; /* record-level walk only */
+
+ for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ return 0;
+
+ /* Skip if this inode is free */
+ if (XFS_INOBT_MASK(j) & irec->ir_free)
+ continue;
+
+ /* Otherwise call our function. */
+ ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+ error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+ if (error)
+ return error;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Tear down the inobt walk state: delete the btree cursor (if there is one),
+ * passing @error along, then release the AGI buffer (if there is one)
+ * through @tp.  Both caller pointers end up NULL, so a second call is a
+ * no-op.
+ */
+static inline void
+xfs_iwalk_del_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			error)
+{
+	struct xfs_btree_cur	*cur = *curpp;
+	struct xfs_buf		*agi_bp = *agi_bpp;
+
+	*curpp = NULL;
+	*agi_bpp = NULL;
+
+	/* Cursor first, then the buffer it was built on. */
+	if (cur)
+		xfs_btree_del_cursor(cur, error);
+	if (agi_bp)
+		xfs_trans_brelse(tp, agi_bp);
+}
+
+/*
+ * Set ourselves up for walking inobt records starting from a given point in
+ * the filesystem.
+ *
+ * If caller passed in a nonzero start inode number, load the record from the
+ * inobt and make the record look like all the inodes before agino are free so
+ * that we skip them, and then move the cursor to the next inobt record. This
+ * is how we support starting an iwalk in the middle of an inode chunk.
+ *
+ * If the caller passed in a start number of zero, move the cursor to the first
+ * inobt record.
+ *
+ * On success, *@has_more is set by the final lookup or cursor increment and
+ * tells the caller whether there is a record at the cursor. The record
+ * cache is emptied first and holds at most one (possibly trimmed) record on
+ * return.
+ *
+ * The caller is responsible for cleaning up the cursor and buffer pointer
+ * regardless of the error status.
+ */
+STATIC int
+xfs_iwalk_ag_start(
+ struct xfs_iwalk_ag *iwag,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ struct xfs_btree_cur **curpp,
+ struct xfs_buf **agi_bpp,
+ int *has_more)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_inobt_rec_incore *irec;
+ int error;
+
+ /* Set up a fresh cursor and empty the inobt cache. */
+ iwag->nr_recs = 0;
+ error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+ if (error)
+ return error;
+
+ /* Starting at the beginning of the AG? That's easy! */
+ if (agino == 0)
+ return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
+
+ /*
+ * Otherwise, we have to grab the inobt record where we left off, stuff
+ * the record into our cache, and then see if there are more records.
+ * We require a lookup cache of at least two elements so that the
+ * caller doesn't have to deal with tearing down the cursor to walk the
+ * records.
+ */
+ error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
+ if (error)
+ return error;
+
+ /*
+ * If the LE lookup at @agino yields no records, jump ahead to the
+ * inobt cursor increment to see if there are more records to process.
+ */
+ if (!*has_more)
+ goto out_advance;
+
+ /* Get the record, should always work */
+ irec = &iwag->recs[iwag->nr_recs];
+ error = xfs_inobt_get_rec(*curpp, irec, has_more);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+
+ /*
+ * If the LE lookup yielded an inobt record before the cursor position,
+ * skip it and see if there's another one after it.
+ */
+ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+ goto out_advance;
+
+ /*
+ * If agino fell in the middle of the inode record, make it look like
+ * the inodes up to agino are free so that we don't return them again.
+ */
+ if (iwag->trim_start)
+ xfs_iwalk_adjust_start(agino, irec);
+
+ /*
+ * The prefetch calculation is supposed to give us a large enough inobt
+ * record cache that this function can stage a partial first record and
+ * the loop body can cache a record without having to check for cache
+ * space until after it reads an inobt record.
+ */
+ iwag->nr_recs++;
+ ASSERT(iwag->nr_recs < iwag->sz_recs);
+
+out_advance:
+ return xfs_btree_increment(*curpp, 0, has_more);
+}
+
+/*
+ * Drop the inobt cursor and AGI buffer, then run the callbacks on the cached
+ * inobt records.  This is called when the record cache is full, and once
+ * more at the end of an AG for whatever is left in the cache.  The cursor
+ * has to go first because the callbacks are allowed to do anything.
+ *
+ * If *@has_more is true on entry, the cursor is recreated afterwards and
+ * positioned where it would have been had the cache not been full (and
+ * therefore we could've just incremented the cursor).  On exit *@has_more
+ * then says whether or not the caller should try for more inode records.
+ *
+ * If *@has_more is false on entry there is nothing left to read, so no new
+ * cursor is set up and *@curpp and *@agi_bpp stay NULL.
+ *
+ * Returns 0 on success, otherwise the nonzero value from a callback or from
+ * setting the cursor back up.
+ */
+STATIC int
+xfs_iwalk_run_callbacks(
+	struct xfs_iwalk_ag		*iwag,
+	xfs_agnumber_t			agno,
+	struct xfs_btree_cur		**curpp,
+	struct xfs_buf			**agi_bpp,
+	int				*has_more)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_inobt_rec_incore	*irec;
+	xfs_agino_t			restart;
+	int				error;
+
+	ASSERT(iwag->nr_recs > 0);
+
+	/* Delete cursor but remember the last record we cached... */
+	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+	irec = &iwag->recs[iwag->nr_recs - 1];
+	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+	error = xfs_iwalk_ag_recs(iwag);
+	if (error)
+		return error;
+
+	/* ...empty the cache... */
+	iwag->nr_recs = 0;
+
+	/*
+	 * Test the flag, not the pointer: @has_more itself is never NULL, so
+	 * checking the pointer would make this early return dead code and
+	 * rebuild a cursor that the caller is about to throw away.
+	 */
+	if (!*has_more)
+		return 0;
+
+	/* ...and recreate the cursor just past where we left off. */
+	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+	if (error)
+		return error;
+
+	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+}
+
+/*
+ * Walk all inodes in a single AG, from @iwag->startino to the end of the AG.
+ *
+ * Inobt records are read into @iwag->recs one at a time.  Whenever the cache
+ * fills up, xfs_iwalk_run_callbacks() processes the cached records and the
+ * loop carries on.  Records left in the cache when the loop ends are
+ * processed by one final xfs_iwalk_run_callbacks() call.  The cursor and
+ * AGI buffer are always released before returning.
+ *
+ * Returns 0 or the first nonzero value from a btree operation or callback.
+ * If xfs_pwork_want_abort() fires inside the loop, the walk stops without
+ * running callbacks on the records cached so far.
+ */
+STATIC int
+xfs_iwalk_ag(
+ struct xfs_iwalk_ag *iwag)
+{
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_buf *agi_bp = NULL;
+ struct xfs_btree_cur *cur = NULL;
+ xfs_agnumber_t agno;
+ xfs_agino_t agino;
+ int has_more; /* only read once xfs_iwalk_ag_start has succeeded */
+ int error = 0;
+
+ /* Set up our cursor at the right place in the inode btree. */
+ agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+ agino = XFS_INO_TO_AGINO(mp, iwag->startino);
+ error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+
+ while (!error && has_more) {
+ struct xfs_inobt_rec_incore *irec;
+
+ cond_resched();
+ if (xfs_pwork_want_abort(&iwag->pwork))
+ goto out;
+
+ /* Fetch the inobt record. */
+ irec = &iwag->recs[iwag->nr_recs];
+ error = xfs_inobt_get_rec(cur, irec, &has_more);
+ if (error || !has_more)
+ break;
+
+ /* No allocated inodes in this chunk; skip it. */
+ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+ if (error)
+ break;
+ continue;
+ }
+
+ /*
+ * Start readahead for this inode chunk in anticipation of
+ * walking the inodes.
+ */
+ if (iwag->iwalk_fn)
+ xfs_iwalk_ichunk_ra(mp, agno, irec);
+
+ /*
+ * If there's space in the buffer for more records, increment
+ * the btree cursor and grab more.
+ */
+ if (++iwag->nr_recs < iwag->sz_recs) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+ if (error || !has_more)
+ break;
+ continue;
+ }
+
+ /*
+ * Otherwise, we need to save cursor state and run the callback
+ * function on the cached records. The run_callbacks function
+ * is supposed to return a cursor pointing to the record where
+ * we would be if we had been able to increment like above.
+ */
+ ASSERT(has_more);
+ error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
+ &has_more);
+ }
+
+ if (iwag->nr_recs == 0 || error)
+ goto out;
+
+ /* Walk the unprocessed records in the cache. */
+ error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
+
+out:
+ xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+ return error;
+}
+
+/*
+ * We experimentally determined that the reduction in ioctl call overhead
+ * diminishes when userspace asks for more than 2048 inodes, so we'll cap
+ * prefetch at this point.
+ */
+#define IWALK_MAX_INODE_PREFETCH (2048U)
+
+/*
+ * Given the number of inodes to prefetch, set the number of inobt records that
+ * we cache in memory, which controls the number of inodes we try to read
+ * ahead. Set the maximum if @inodes == 0.
+ */
+static inline unsigned int
+xfs_iwalk_prefetch(
+ unsigned int inodes)
+{
+ unsigned int inobt_records;
+
+ /*
+ * If the caller didn't tell us the number of inodes they wanted,
+ * assume the maximum prefetch possible for best performance.
+ * Otherwise, cap prefetch at that maximum so that we don't start an
+ * absurd amount of prefetch.
+ */
+ if (inodes == 0)
+ inodes = IWALK_MAX_INODE_PREFETCH;
+ inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);
+
+ /* Round the inode count up to a full chunk. */
+ inodes = round_up(inodes, XFS_INODES_PER_CHUNK);
+
+ /*
+ * In order to convert the number of inodes to prefetch into an
+ * estimate of the number of inobt records to cache, we require a
+ * conversion factor that reflects our expectations of the average
+ * loading factor of an inode chunk. Based on data gathered, most
+ * (but not all) filesystems manage to keep the inode chunks totally
+ * full, so we'll underestimate slightly so that our readahead will
+ * still deliver the performance we want on aging filesystems:
+ *
+ * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
+ *
+ * The funny math is to avoid the integer truncation of (4 / 5) to zero.
+ */
+ inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);
+
+ /*
+ * Allocate enough space to prefetch at least two inobt records so that
+ * we can cache both the record where the iwalk started and the next
+ * record. This simplifies the AG inode walk loop setup code.
+ */
+ return max(inobt_records, 2U);
+}
+
+/*
+ * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn
+ * will be called for each allocated inode, being passed the inode's number and
+ * @data. @inode_records controls how many inobt records' worth of inodes we
+ * try to readahead.
+ */
+int
+xfs_iwalk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records,
+ void *data)
+{
+ struct xfs_iwalk_ag iwag = {
+ .mp = mp,
+ .tp = tp,
+ .iwalk_fn = iwalk_fn,
+ .data = data,
+ .startino = startino,
+ .sz_recs = xfs_iwalk_prefetch(inode_records),
+ .trim_start = 1,
+ .skip_empty = 1,
+ .pwork = XFS_PWORK_SINGLE_THREADED,
+ };
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(&iwag);
+ if (error)
+ return error;
+
+ for (; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_iwalk_ag(&iwag);
+ if (error)
+ break;
+ iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_IWALK_SAME_AG)
+ break;
+ }
+
+ xfs_iwalk_free(&iwag);
+ return error;
+}
+
+/* Run per-thread iwalk work. */
+static int
+xfs_iwalk_ag_work(
+ struct xfs_mount *mp,
+ struct xfs_pwork *pwork)
+{
+ struct xfs_iwalk_ag *iwag;
+ int error = 0;
+
+ iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
+ if (xfs_pwork_want_abort(pwork))
+ goto out;
+
+ error = xfs_iwalk_alloc(iwag);
+ if (error)
+ goto out;
+
+ error = xfs_iwalk_ag(iwag);
+ xfs_iwalk_free(iwag);
+out:
+ kmem_free(iwag);
+ return error;
+}
+
+/*
+ * Walk all the inodes in the filesystem using multiple threads to process each
+ * AG.
+ */
+int
+xfs_iwalk_threaded(
+ struct xfs_mount *mp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records,
+ bool polled,
+ void *data)
+{
+ struct xfs_pwork_ctl pctl;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ unsigned int nr_threads;
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));
+
+ nr_threads = xfs_pwork_guess_datadev_parallelism(mp);
+ error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk",
+ nr_threads);
+ if (error)
+ return error;
+
+ for (; agno < mp->m_sb.sb_agcount; agno++) {
+ struct xfs_iwalk_ag *iwag;
+
+ if (xfs_pwork_ctl_want_abort(&pctl))
+ break;
+
+ iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP);
+ iwag->mp = mp;
+ iwag->iwalk_fn = iwalk_fn;
+ iwag->data = data;
+ iwag->startino = startino;
+ iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ xfs_pwork_queue(&pctl, &iwag->pwork);
+ startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_IWALK_SAME_AG)
+ break;
+ }
+
+ if (polled)
+ xfs_pwork_poll(&pctl);
+ return xfs_pwork_destroy(&pctl);
+}
+
+/*
+ * Allow callers to cache up to a page's worth of inobt records. This reflects
+ * the existing inumbers prefetching behavior. Since the inobt walk does not
+ * itself do anything with the inobt records, we can set a fairly high limit
+ * here.
+ */
+#define MAX_INOBT_WALK_PREFETCH \
+ (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))
+
+/*
+ * Given the number of records that the user wanted, set the number of inobt
+ * records that we buffer in memory. Set the maximum if @inobt_records == 0.
+ */
+static inline unsigned int
+xfs_inobt_walk_prefetch(
+ unsigned int inobt_records)
+{
+ /*
+ * If the caller didn't tell us the number of inobt records they
+ * wanted, assume the maximum prefetch possible for best performance.
+ */
+ if (inobt_records == 0)
+ inobt_records = MAX_INOBT_WALK_PREFETCH;
+
+ /*
+ * Allocate enough space to prefetch at least two inobt records so that
+ * we can cache both the record where the iwalk started and the next
+ * record. This simplifies the AG inode walk loop setup code.
+ */
+ inobt_records = max(inobt_records, 2U);
+
+ /*
+ * Cap prefetch at that maximum so that we don't use an absurd amount
+ * of memory.
+ */
+ return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
+}
+
+/*
+ * Walk all inode btree records in the filesystem starting from @startino. The
+ * @inobt_walk_fn will be called for each btree record, being passed the incore
+ * record and @data. @inobt_records controls how many inobt records we try to
+ * cache ahead of time.
+ */
+int
+xfs_inobt_walk(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t startino,
+ unsigned int flags,
+ xfs_inobt_walk_fn inobt_walk_fn,
+ unsigned int inobt_records,
+ void *data)
+{
+ struct xfs_iwalk_ag iwag = {
+ .mp = mp,
+ .tp = tp,
+ .inobt_walk_fn = inobt_walk_fn,
+ .data = data,
+ .startino = startino,
+ .sz_recs = xfs_inobt_walk_prefetch(inobt_records),
+ .pwork = XFS_PWORK_SINGLE_THREADED,
+ };
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+
+ ASSERT(agno < mp->m_sb.sb_agcount);
+ ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));
+
+ error = xfs_iwalk_alloc(&iwag);
+ if (error)
+ return error;
+
+ for (; agno < mp->m_sb.sb_agcount; agno++) {
+ error = xfs_iwalk_ag(&iwag);
+ if (error)
+ break;
+ iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+ break;
+ }
+
+ xfs_iwalk_free(&iwag);
+ return error;
+}
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
new file mode 100644
index 000000000000..6c960e10ed4d
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_IWALK_H__
+#define __XFS_IWALK_H__
+
+/* Walk all inodes in the filesystem starting from @startino. */
+typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t ino, void *data);
+/* Return values for xfs_iwalk_fn. */
+#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE)
+#define XFS_IWALK_ABORT (XFS_ITER_ABORT)
+
+int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino,
+ unsigned int flags, xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records, void *data);
+int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
+ unsigned int flags, xfs_iwalk_fn iwalk_fn,
+ unsigned int inode_records, bool poll, void *data);
+
+/* Only iterate inodes within the same AG as @startino. */
+#define XFS_IWALK_SAME_AG (0x1)
+
+#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
+
+/* Walk all inode btree records in the filesystem starting from @startino. */
+typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *irec,
+ void *data);
+/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */
+#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT)
+
+int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_ino_t startino, unsigned int flags,
+ xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records,
+ void *data);
+
+/* Only iterate inobt records within the same AG as @startino. */
+#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG)
+
+#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG)
+
+#endif /* __XFS_IWALK_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index edbd5a210df2..ca15105681ca 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -110,8 +110,6 @@ typedef __u32 xfs_nlink_t;
#define current_restore_flags_nested(sp, f) \
(current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
-#define spinlock_destroy(lock)
-
#define NBBY 8 /* number of bits per byte */
/*
@@ -221,6 +219,9 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
return x;
}
+int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+ char *data, unsigned int op);
+
#define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c3b610b687d1..00e9f5c388d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -16,13 +16,10 @@
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_log_recover.h"
-#include "xfs_inode.h"
#include "xfs_trace.h"
-#include "xfs_fsops.h"
-#include "xfs_cksum.h"
#include "xfs_sysfs.h"
#include "xfs_sb.h"
+#include "xfs_health.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -44,21 +41,14 @@ STATIC int
xlog_space_left(
struct xlog *log,
atomic64_t *head);
-STATIC int
-xlog_sync(
- struct xlog *log,
- struct xlog_in_core *iclog);
STATIC void
xlog_dealloc_log(
struct xlog *log);
/* local state machine functions */
-STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
-STATIC void
-xlog_state_do_callback(
- struct xlog *log,
- int aborted,
- struct xlog_in_core *iclog);
+STATIC void xlog_state_done_syncing(
+ struct xlog_in_core *iclog,
+ bool aborted);
STATIC int
xlog_state_get_iclog_space(
struct xlog *log,
@@ -106,8 +96,7 @@ STATIC void
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- int count,
- bool syncing);
+ int count);
STATIC void
xlog_verify_tail_lsn(
struct xlog *log,
@@ -116,7 +105,7 @@ xlog_verify_tail_lsn(
#else
#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
-#define xlog_verify_iclog(a,b,c,d)
+#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b,c)
#endif
@@ -540,32 +529,6 @@ xfs_log_done(
return lsn;
}
-/*
- * Attaches a new iclog I/O completion callback routine during
- * transaction commit. If the log is in error state, a non-zero
- * return code is handed back and the caller is responsible for
- * executing the callback at an appropriate time.
- */
-int
-xfs_log_notify(
- struct xlog_in_core *iclog,
- xfs_log_callback_t *cb)
-{
- int abortflg;
-
- spin_lock(&iclog->ic_callback_lock);
- abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
- if (!abortflg) {
- ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
- (iclog->ic_state == XLOG_STATE_WANT_SYNC));
- cb->cb_next = NULL;
- *(iclog->ic_callback_tail) = cb;
- iclog->ic_callback_tail = &(cb->cb_next);
- }
- spin_unlock(&iclog->ic_callback_lock);
- return abortflg;
-}
-
int
xfs_log_release_iclog(
struct xfs_mount *mp,
@@ -806,16 +769,12 @@ xfs_log_mount_finish(
* The mount has failed. Cancel the recovery if it hasn't completed and destroy
* the log.
*/
-int
+void
xfs_log_mount_cancel(
struct xfs_mount *mp)
{
- int error;
-
- error = xlog_recover_cancel(mp->m_log);
+ xlog_recover_cancel(mp->m_log);
xfs_log_unmount(mp);
-
- return error;
}
/*
@@ -861,7 +820,7 @@ xfs_log_write_unmount_record(
* recalculated during log recovery at next mount. Refer to
* xlog_check_unmount_rec for more details.
*/
- if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp,
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
@@ -931,7 +890,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
* Or, if we are doing a forced umount (typically because of IO errors).
*/
if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
+ xfs_readonly_buftarg(log->l_targ)) {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
return 0;
}
@@ -1243,53 +1202,49 @@ xlog_space_left(
}
-/*
- * Log function which is called when an io completes.
- *
- * The log manager needs its own routine, in order to control what
- * happens with the buffer after the write completes.
- */
static void
-xlog_iodone(xfs_buf_t *bp)
+xlog_ioend_work(
+ struct work_struct *work)
{
- struct xlog_in_core *iclog = bp->b_log_item;
- struct xlog *l = iclog->ic_log;
- int aborted = 0;
+ struct xlog_in_core *iclog =
+ container_of(work, struct xlog_in_core, ic_end_io_work);
+ struct xlog *log = iclog->ic_log;
+ bool aborted = false;
+ int error;
+
+ error = blk_status_to_errno(iclog->ic_bio.bi_status);
+#ifdef DEBUG
+ /* treat writes with injected CRC errors as failed */
+ if (iclog->ic_fail_crc)
+ error = -EIO;
+#endif
/*
- * Race to shutdown the filesystem if we see an error or the iclog is in
- * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
- * CRC errors into log recovery.
+ * Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) ||
- iclog->ic_state & XLOG_STATE_IOABORT) {
- if (iclog->ic_state & XLOG_STATE_IOABORT)
- iclog->ic_state &= ~XLOG_STATE_IOABORT;
-
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_stale(bp);
- xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
+ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ xfs_alert(log->l_mp, "log I/O error %d", error);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
/*
* This flag will be propagated to the trans-committed
* callback routines to let them know that the log-commit
* didn't succeed.
*/
- aborted = XFS_LI_ABORTED;
+ aborted = true;
} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
- aborted = XFS_LI_ABORTED;
+ aborted = true;
}
- /* log I/O is always issued ASYNC */
- ASSERT(bp->b_flags & XBF_ASYNC);
xlog_state_done_syncing(iclog, aborted);
+ bio_uninit(&iclog->ic_bio);
/*
- * drop the buffer lock now that we are done. Nothing references
- * the buffer after this, so an unmount waiting on this lock can now
- * tear it down safely. As such, it is unsafe to reference the buffer
- * (bp) after the unlock as we could race with it being freed.
+ * Drop the lock to signal that we are done. Nothing references the
+ * iclog after this, so an unmount waiting on this lock can now tear it
+ * down safely. As such, it is unsafe to reference the iclog after the
+ * unlock as we could race with it being freed.
*/
- xfs_buf_unlock(bp);
+ up(&iclog->ic_sema);
}
/*
@@ -1300,65 +1255,26 @@ xlog_iodone(xfs_buf_t *bp)
* If the filesystem blocksize is too large, we may need to choose a
* larger size since the directory code currently logs entire blocks.
*/
-
STATIC void
xlog_get_iclog_buffer_size(
struct xfs_mount *mp,
struct xlog *log)
{
- int size;
- int xhdrs;
-
if (mp->m_logbufs <= 0)
- log->l_iclog_bufs = XLOG_MAX_ICLOGS;
- else
- log->l_iclog_bufs = mp->m_logbufs;
+ mp->m_logbufs = XLOG_MAX_ICLOGS;
+ if (mp->m_logbsize <= 0)
+ mp->m_logbsize = XLOG_BIG_RECORD_BSIZE;
+
+ log->l_iclog_bufs = mp->m_logbufs;
+ log->l_iclog_size = mp->m_logbsize;
/*
- * Buffer size passed in from mount system call.
+ * # headers = size / 32k - one header holds cycles from 32k of data.
*/
- if (mp->m_logbsize > 0) {
- size = log->l_iclog_size = mp->m_logbsize;
- log->l_iclog_size_log = 0;
- while (size != 1) {
- log->l_iclog_size_log++;
- size >>= 1;
- }
-
- if (xfs_sb_version_haslogv2(&mp->m_sb)) {
- /* # headers = size / 32k
- * one header holds cycles from 32k of data
- */
-
- xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE;
- if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE)
- xhdrs++;
- log->l_iclog_hsize = xhdrs << BBSHIFT;
- log->l_iclog_heads = xhdrs;
- } else {
- ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE);
- log->l_iclog_hsize = BBSIZE;
- log->l_iclog_heads = 1;
- }
- goto done;
- }
-
- /* All machines use 32kB buffers by default. */
- log->l_iclog_size = XLOG_BIG_RECORD_BSIZE;
- log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT;
-
- /* the default log size is 16k or 32k which is one header sector */
- log->l_iclog_hsize = BBSIZE;
- log->l_iclog_heads = 1;
-
-done:
- /* are we being asked to make the sizes selected above visible? */
- if (mp->m_logbufs == 0)
- mp->m_logbufs = log->l_iclog_bufs;
- if (mp->m_logbsize == 0)
- mp->m_logbsize = log->l_iclog_size;
-} /* xlog_get_iclog_buffer_size */
-
+ log->l_iclog_heads =
+ DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
+ log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
+}
void
xfs_log_work_queue(
@@ -1421,7 +1337,6 @@ xlog_alloc_log(
xlog_rec_header_t *head;
xlog_in_core_t **iclogp;
xlog_in_core_t *iclog, *prev_iclog=NULL;
- xfs_buf_t *bp;
int i;
int error = -ENOMEM;
uint log2_size = 0;
@@ -1479,30 +1394,6 @@ xlog_alloc_log(
xlog_get_iclog_buffer_size(mp, log);
- /*
- * Use a NULL block for the extra log buffer used during splits so that
- * it will trigger errors if we ever try to do IO on it without first
- * having set it up properly.
- */
- error = -ENOMEM;
- bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
- BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
- if (!bp)
- goto out_free_log;
-
- /*
- * The iclogbuf buffer locks are held over IO but we are not going to do
- * IO yet. Hence unlock the buffer so that the log IO path can grab it
- * when appropriately.
- */
- ASSERT(xfs_buf_islocked(bp));
- xfs_buf_unlock(bp);
-
- /* use high priority wq for log I/O completion */
- bp->b_ioend_wq = mp->m_log_workqueue;
- bp->b_iodone = xlog_iodone;
- log->l_xbuf = bp;
-
spin_lock_init(&log->l_icloglock);
init_waitqueue_head(&log->l_flush_wait);
@@ -1515,29 +1406,22 @@ xlog_alloc_log(
* xlog_in_core_t in xfs_log_priv.h for details.
*/
ASSERT(log->l_iclog_size >= 4096);
- for (i=0; i < log->l_iclog_bufs; i++) {
- *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL);
- if (!*iclogp)
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
+ sizeof(struct bio_vec);
+
+ iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+ if (!iclog)
goto out_free_iclog;
- iclog = *iclogp;
+ *iclogp = iclog;
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- bp = xfs_buf_get_uncached(mp->m_logdev_targp,
- BTOBB(log->l_iclog_size),
- XBF_NO_IOACCT);
- if (!bp)
+ iclog->ic_data = kmem_alloc_large(log->l_iclog_size,
+ KM_MAYFAIL);
+ if (!iclog->ic_data)
goto out_free_iclog;
-
- ASSERT(xfs_buf_islocked(bp));
- xfs_buf_unlock(bp);
-
- /* use high priority wq for log I/O completion */
- bp->b_ioend_wq = mp->m_log_workqueue;
- bp->b_iodone = xlog_iodone;
- iclog->ic_bp = bp;
- iclog->ic_data = bp->b_addr;
#ifdef DEBUG
log->l_iclog_bak[i] = &iclog->ic_header;
#endif
@@ -1551,36 +1435,43 @@ xlog_alloc_log(
head->h_fmt = cpu_to_be32(XLOG_FMT);
memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
- iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize;
+ iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
spin_lock_init(&iclog->ic_callback_lock);
- iclog->ic_callback_tail = &(iclog->ic_callback);
+ INIT_LIST_HEAD(&iclog->ic_callbacks);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
+ INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work);
+ sema_init(&iclog->ic_sema, 1);
iclogp = &iclog->ic_next;
}
*iclogp = log->l_iclog; /* complete ring */
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
+ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
+ mp->m_fsname);
+ if (!log->l_ioend_workqueue)
+ goto out_free_iclog;
+
error = xlog_cil_init(log);
if (error)
- goto out_free_iclog;
+ goto out_destroy_workqueue;
return log;
+out_destroy_workqueue:
+ destroy_workqueue(log->l_ioend_workqueue);
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- if (iclog->ic_bp)
- xfs_buf_free(iclog->ic_bp);
+ kmem_free(iclog->ic_data);
kmem_free(iclog);
}
- spinlock_destroy(&log->l_icloglock);
- xfs_buf_free(log->l_xbuf);
out_free_log:
kmem_free(log);
out:
@@ -1765,42 +1656,155 @@ xlog_cksum(
return xfs_end_cksum(crc);
}
-/*
- * The bdstrat callback function for log bufs. This gives us a central
- * place to trap bufs in case we get hit by a log I/O error and need to
- * shutdown. Actually, in practice, even when we didn't get a log error,
- * we transition the iclogs to IOERROR state *after* flushing all existing
- * iclogs to disk. This is because we don't want anymore new transactions to be
- * started or completed afterwards.
- *
- * We lock the iclogbufs here so that we can serialise against IO completion
- * during unmount. We might be processing a shutdown triggered during unmount,
- * and that can occur asynchronously to the unmount thread, and hence we need to
- * ensure that completes before tearing down the iclogbufs. Hence we need to
- * hold the buffer lock across the log IO to acheive that.
- */
-STATIC int
-xlog_bdstrat(
- struct xfs_buf *bp)
+static void
+xlog_bio_end_io(
+ struct bio *bio)
{
- struct xlog_in_core *iclog = bp->b_log_item;
+ struct xlog_in_core *iclog = bio->bi_private;
- xfs_buf_lock(bp);
- if (iclog->ic_state & XLOG_STATE_IOERROR) {
- xfs_buf_ioerror(bp, -EIO);
- xfs_buf_stale(bp);
- xfs_buf_ioend(bp);
+ queue_work(iclog->ic_log->l_ioend_workqueue,
+ &iclog->ic_end_io_work);
+}
+
+static void
+xlog_map_iclog_data(
+ struct bio *bio,
+ void *data,
+ size_t count)
+{
+ do {
+ struct page *page = kmem_to_page(data);
+ unsigned int off = offset_in_page(data);
+ size_t len = min_t(size_t, count, PAGE_SIZE - off);
+
+ WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+
+ data += len;
+ count -= len;
+ } while (count);
+}
+
+STATIC void
+xlog_write_iclog(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint64_t bno,
+ unsigned int count,
+ bool need_flush)
+{
+ ASSERT(bno < log->l_logBBsize);
+
+ /*
+ * We lock the iclogbufs here so that we can serialise against I/O
+ * completion during unmount. We might be processing a shutdown
+ * triggered during unmount, and that can occur asynchronously to the
+ * unmount thread, and hence we need to ensure that completes before
+ * tearing down the iclogbufs. Hence we need to hold the buffer lock
+ * across the log IO to achieve that.
+ */
+ down(&iclog->ic_sema);
+ if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here. Similarly, IO completion will unlock the
- * buffer, so we don't do it here.
+ * doing it here. We kick off the state machine and unlock
+ * the buffer manually, the code needs to be kept in sync
+ * with the I/O completion path.
*/
- return 0;
+ xlog_state_done_syncing(iclog, true);
+ up(&iclog->ic_sema);
+ return;
}
- xfs_buf_submit(bp);
- return 0;
+ iclog->ic_io_size = count;
+
+ bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
+ bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ iclog->ic_bio.bi_private = iclog;
+ iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+ if (need_flush)
+ iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+
+ xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
+ if (is_vmalloc_addr(iclog->ic_data))
+ flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);
+
+ /*
+ * If this log buffer would straddle the end of the log we will have
+ * to split it up into two bios, so that we can continue at the start.
+ */
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ struct bio *split;
+
+ split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
+ GFP_NOIO, &fs_bio_set);
+ bio_chain(split, &iclog->ic_bio);
+ submit_bio(split);
+
+ /* restart at logical offset zero for the remainder */
+ iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
+ }
+
+ submit_bio(&iclog->ic_bio);
+}
+
+/*
+ * We need to bump cycle number for the part of the iclog that is
+ * written to the start of the log. Watch out for the header magic
+ * number case, though.
+ */
+static void
+xlog_split_iclog(
+ struct xlog *log,
+ void *data,
+ uint64_t bno,
+ unsigned int count)
+{
+ unsigned int split_offset = BBTOB(log->l_logBBsize - bno);
+ unsigned int i;
+
+ for (i = split_offset; i < count; i += BBSIZE) {
+ uint32_t cycle = get_unaligned_be32(data + i);
+
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ put_unaligned_be32(cycle, data + i);
+ }
+}
+
+static int
+xlog_calc_iclog_size(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ uint32_t *roundoff)
+{
+ uint32_t count_init, count;
+ bool use_lsunit;
+
+ use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) &&
+ log->l_mp->m_sb.sb_logsunit > 1;
+
+ /* Add for LR header */
+ count_init = log->l_iclog_hsize + iclog->ic_offset;
+
+ /* Round out the log write size */
+ if (use_lsunit) {
+ /* we have a v2 stripe unit to use */
+ count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
+ } else {
+ count = BBTOB(BTOBB(count_init));
+ }
+
+ ASSERT(count >= count_init);
+ *roundoff = count - count_init;
+
+ if (use_lsunit)
+ ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit);
+ else
+ ASSERT(*roundoff < BBTOB(1));
+ return count;
}
/*
@@ -1823,46 +1827,23 @@ xlog_bdstrat(
* log will require grabbing the lock though.
*
* The entire log manager uses a logical block numbering scheme. Only
- * log_sync (and then only bwrite()) know about the fact that the log may
- * not start with block zero on a given device. The log block start offset
- * is added immediately before calling bwrite().
+ * xlog_write_iclog knows about the fact that the log may not start with
+ * block zero on a given device.
*/
-
-STATIC int
+STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog)
{
- xfs_buf_t *bp;
- int i;
- uint count; /* byte count of bwrite */
- uint count_init; /* initial count before roundup */
- int roundoff; /* roundoff to BB or stripe */
- int split = 0; /* split write into two regions */
- int error;
- int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
- int size;
+ unsigned int count; /* byte count of bwrite */
+ unsigned int roundoff; /* roundoff to BB or stripe */
+ uint64_t bno;
+ unsigned int size;
+ bool need_flush = true, split = false;
- XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
- /* Add for LR header */
- count_init = log->l_iclog_hsize + iclog->ic_offset;
-
- /* Round out the log write size */
- if (v2 && log->l_mp->m_sb.sb_logsunit > 1) {
- /* we have a v2 stripe unit to use */
- count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init));
- } else {
- count = BBTOB(BTOBB(count_init));
- }
- roundoff = count - count_init;
- ASSERT(roundoff >= 0);
- ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 &&
- roundoff < log->l_mp->m_sb.sb_logsunit)
- ||
- (log->l_mp->m_sb.sb_logsunit <= 1 &&
- roundoff < BBTOB(1)));
+ count = xlog_calc_iclog_size(log, iclog, &roundoff);
/* move grant heads by roundoff in sync */
xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
@@ -1873,41 +1854,19 @@ xlog_sync(
/* real byte length */
size = iclog->ic_offset;
- if (v2)
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb))
size += roundoff;
iclog->ic_header.h_len = cpu_to_be32(size);
- bp = iclog->ic_bp;
- XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
-
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
- /* Do we need to split this write into 2 parts? */
- if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
- char *dptr;
-
- split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
- count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
- iclog->ic_bwritecnt = 2;
+ bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
- /*
- * Bump the cycle numbers at the start of each block in the
- * part of the iclog that ends up in the buffer that gets
- * written to the start of the log.
- *
- * Watch out for the header magic number case, though.
- */
- dptr = (char *)&iclog->ic_header + count;
- for (i = 0; i < split; i += BBSIZE) {
- uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
- if (++cycle == XLOG_HEADER_MAGIC_NUM)
- cycle++;
- *(__be32 *)dptr = cpu_to_be32(cycle);
-
- dptr += BBSIZE;
- }
- } else {
- iclog->ic_bwritecnt = 1;
+ /* Do we need to split this write into 2 parts? */
+ if (bno + BTOBB(count) > log->l_logBBsize) {
+ xlog_split_iclog(log, &iclog->ic_header, bno, count);
+ split = true;
}
/* calculcate the checksum */
@@ -1920,18 +1879,15 @@ xlog_sync(
* write on I/O completion and shutdown the fs. The subsequent mount
* detects the bad CRC and attempts to recover.
*/
+#ifdef DEBUG
if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
- iclog->ic_state |= XLOG_STATE_IOABORT;
+ iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
be64_to_cpu(iclog->ic_header.h_lsn));
}
-
- bp->b_io_length = BTOBB(count);
- bp->b_log_item = iclog;
- bp->b_flags &= ~XBF_FLUSH;
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
+#endif
/*
* Flush the data device before flushing the log to make sure all meta
@@ -1941,50 +1897,14 @@ xlog_sync(
* synchronously here; for an internal log we can simply use the block
* layer state machine for preflushes.
*/
- if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp)
+ if (log->l_targ != log->l_mp->m_ddev_targp || split) {
xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp);
- else
- bp->b_flags |= XBF_FLUSH;
-
- ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
- ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
- xlog_verify_iclog(log, iclog, count, true);
-
- /* account for log which doesn't start at block #0 */
- XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
-
- /*
- * Don't call xfs_bwrite here. We do log-syncs even when the filesystem
- * is shutting down.
- */
- error = xlog_bdstrat(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_sync");
- return error;
+ need_flush = false;
}
- if (split) {
- bp = iclog->ic_log->l_xbuf;
- XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */
- xfs_buf_associate_memory(bp,
- (char *)&iclog->ic_header + count, split);
- bp->b_log_item = iclog;
- bp->b_flags &= ~XBF_FLUSH;
- bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA);
-
- ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
- ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
-
- /* account for internal log which doesn't start at block #0 */
- XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart);
- error = xlog_bdstrat(bp);
- if (error) {
- xfs_buf_ioerror_alert(bp, "xlog_sync (split)");
- return error;
- }
- }
- return 0;
-} /* xlog_sync */
+
+ xlog_verify_iclog(log, iclog, count);
+ xlog_write_iclog(log, iclog, bno, count, need_flush);
+}
/*
* Deallocate a log structure
@@ -2004,31 +1924,21 @@ xlog_dealloc_log(
*/
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
- xfs_buf_lock(iclog->ic_bp);
- xfs_buf_unlock(iclog->ic_bp);
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
iclog = iclog->ic_next;
}
- /*
- * Always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it. Also, cycle the lock
- * first to ensure we've completed IO on it.
- */
- xfs_buf_lock(log->l_xbuf);
- xfs_buf_unlock(log->l_xbuf);
- xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
- xfs_buf_free(log->l_xbuf);
-
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
- xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
+ kmem_free(iclog->ic_data);
kmem_free(iclog);
iclog = next_iclog;
}
- spinlock_destroy(&log->l_icloglock);
log->l_mp->m_log = NULL;
+ destroy_workqueue(log->l_ioend_workqueue);
kmem_free(log);
} /* xlog_dealloc_log */
@@ -2068,7 +1978,7 @@ xlog_print_tic_res(
/* match with XLOG_REG_TYPE_* in xfs_log.h */
#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = {
+ static char *res_type_str[] = {
REG_TYPE_STR(BFORMAT, "bformat"),
REG_TYPE_STR(BCHUNK, "bchunk"),
REG_TYPE_STR(EFI_FORMAT, "efi_format"),
@@ -2088,8 +1998,15 @@ xlog_print_tic_res(
REG_TYPE_STR(UNMOUNT, "unmount"),
REG_TYPE_STR(COMMIT, "commit"),
REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create")
+ REG_TYPE_STR(ICREATE, "inode create"),
+ REG_TYPE_STR(RUI_FORMAT, "rui_format"),
+ REG_TYPE_STR(RUD_FORMAT, "rud_format"),
+ REG_TYPE_STR(CUI_FORMAT, "cui_format"),
+ REG_TYPE_STR(CUD_FORMAT, "cud_format"),
+ REG_TYPE_STR(BUI_FORMAT, "bui_format"),
+ REG_TYPE_STR(BUD_FORMAT, "bud_format"),
};
+ BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
#undef REG_TYPE_STR
xfs_warn(mp, "ticket reservation summary:");
@@ -2602,7 +2519,7 @@ xlog_state_clean_log(
if (iclog->ic_state == XLOG_STATE_DIRTY) {
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_offset = 0;
- ASSERT(iclog->ic_callback == NULL);
+ ASSERT(list_empty_careful(&iclog->ic_callbacks));
/*
* If the number of ops in this iclog indicate it just
* contains the dummy transaction, we can
@@ -2672,37 +2589,32 @@ xlog_state_clean_log(
STATIC xfs_lsn_t
xlog_get_lowest_lsn(
- struct xlog *log)
+ struct xlog *log)
{
- xlog_in_core_t *lsn_log;
- xfs_lsn_t lowest_lsn, lsn;
+ struct xlog_in_core *iclog = log->l_iclog;
+ xfs_lsn_t lowest_lsn = 0, lsn;
- lsn_log = log->l_iclog;
- lowest_lsn = 0;
do {
- if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) {
- lsn = be64_to_cpu(lsn_log->ic_header.h_lsn);
- if ((lsn && !lowest_lsn) ||
- (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) {
+ if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))
+ continue;
+
+ lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0)
lowest_lsn = lsn;
- }
- }
- lsn_log = lsn_log->ic_next;
- } while (lsn_log != log->l_iclog);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
return lowest_lsn;
}
-
STATIC void
xlog_state_do_callback(
struct xlog *log,
- int aborted,
+ bool aborted,
struct xlog_in_core *ciclog)
{
xlog_in_core_t *iclog;
xlog_in_core_t *first_iclog; /* used to know when we've
* processed all iclogs once */
- xfs_log_callback_t *cb, *cb_next;
int flushcnt = 0;
xfs_lsn_t lowest_lsn;
int ioerrors; /* counter: iclogs with errors */
@@ -2813,7 +2725,7 @@ xlog_state_do_callback(
*/
ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- if (iclog->ic_callback)
+ if (!list_empty_careful(&iclog->ic_callbacks))
atomic64_set(&log->l_last_sync_lsn,
be64_to_cpu(iclog->ic_header.h_lsn));
@@ -2830,26 +2742,20 @@ xlog_state_do_callback(
* callbacks being added.
*/
spin_lock(&iclog->ic_callback_lock);
- cb = iclog->ic_callback;
- while (cb) {
- iclog->ic_callback_tail = &(iclog->ic_callback);
- iclog->ic_callback = NULL;
- spin_unlock(&iclog->ic_callback_lock);
+ while (!list_empty(&iclog->ic_callbacks)) {
+ LIST_HEAD(tmp);
- /* perform callbacks in the order given */
- for (; cb; cb = cb_next) {
- cb_next = cb->cb_next;
- cb->cb_func(cb->cb_arg, aborted);
- }
+ list_splice_init(&iclog->ic_callbacks, &tmp);
+
+ spin_unlock(&iclog->ic_callback_lock);
+ xlog_cil_process_committed(&tmp, aborted);
spin_lock(&iclog->ic_callback_lock);
- cb = iclog->ic_callback;
}
loopdidcallbacks++;
funcdidcallbacks++;
spin_lock(&log->l_icloglock);
- ASSERT(iclog->ic_callback == NULL);
spin_unlock(&iclog->ic_callback_lock);
if (!(iclog->ic_state & XLOG_STATE_IOERROR))
iclog->ic_state = XLOG_STATE_DIRTY;
@@ -2935,18 +2841,16 @@ xlog_state_do_callback(
*/
STATIC void
xlog_state_done_syncing(
- xlog_in_core_t *iclog,
- int aborted)
+ struct xlog_in_core *iclog,
+ bool aborted)
{
- struct xlog *log = iclog->ic_log;
+ struct xlog *log = iclog->ic_log;
spin_lock(&log->l_icloglock);
ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
iclog->ic_state == XLOG_STATE_IOERROR);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
- ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
-
/*
* If we got an error, either on the first buffer, or in the case of
@@ -2954,13 +2858,8 @@ xlog_state_done_syncing(
* and none should ever be attempted to be written to disk
* again.
*/
- if (iclog->ic_state != XLOG_STATE_IOERROR) {
- if (--iclog->ic_bwritecnt == 1) {
- spin_unlock(&log->l_icloglock);
- return;
- }
+ if (iclog->ic_state != XLOG_STATE_IOERROR)
iclog->ic_state = XLOG_STATE_DONE_SYNC;
- }
/*
* Someone could be sleeping prior to writing out the next
@@ -3229,7 +3128,7 @@ xlog_state_release_iclog(
* flags after this point.
*/
if (sync)
- return xlog_sync(log, iclog);
+ xlog_sync(log, iclog);
return 0;
} /* xlog_state_release_iclog */
@@ -3820,8 +3719,7 @@ STATIC void
xlog_verify_iclog(
struct xlog *log,
struct xlog_in_core *iclog,
- int count,
- bool syncing)
+ int count)
{
xlog_op_header_t *ophead;
xlog_in_core_t *icptr;
@@ -3865,7 +3763,7 @@ xlog_verify_iclog(
/* clientid is only 1 byte */
p = &ophead->oh_clientid;
field_offset = p - base_ptr;
- if (!syncing || (field_offset & 0x1ff)) {
+ if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
@@ -3888,7 +3786,7 @@ xlog_verify_iclog(
/* check length */
p = &ophead->oh_len;
field_offset = p - base_ptr;
- if (!syncing || (field_offset & 0x1ff)) {
+ if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((uintptr_t)&ophead->oh_len -
@@ -4025,7 +3923,7 @@ xfs_log_force_umount(
* avoid races.
*/
wake_up_all(&log->l_cilp->xc_commit_wait);
- xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
+ xlog_state_do_callback(log, true, NULL);
#ifdef XFSERRORDEBUG
{
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 73a64bf32f6f..84e06805160f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -6,6 +6,8 @@
#ifndef __XFS_LOG_H__
#define __XFS_LOG_H__
+struct xfs_cil_ctx;
+
struct xfs_log_vec {
struct xfs_log_vec *lv_next; /* next lv in build list */
int lv_niovecs; /* number of iovecs in lv */
@@ -72,16 +74,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
}
/*
- * Structure used to pass callback function and the function's argument
- * to the log manager.
- */
-typedef struct xfs_log_callback {
- struct xfs_log_callback *cb_next;
- void (*cb_func)(void *, int);
- void *cb_arg;
-} xfs_log_callback_t;
-
-/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
*/
@@ -125,12 +117,10 @@ int xfs_log_mount(struct xfs_mount *mp,
xfs_daddr_t start_block,
int num_bblocks);
int xfs_log_mount_finish(struct xfs_mount *mp);
-int xfs_log_mount_cancel(struct xfs_mount *);
+void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_notify(struct xlog_in_core *iclog,
- struct xfs_log_callback *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
@@ -148,6 +138,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket);
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, bool regrant);
+void xlog_cil_process_committed(struct list_head *list, bool aborted);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d3884e08b43c..fa5602d0fd7f 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -10,10 +10,7 @@
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
-#include "xfs_discard.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
@@ -246,7 +243,8 @@ xfs_cil_prepare_item(
* shadow buffer, so update the the pointer to it appropriately.
*/
if (!old_lv) {
- lv->lv_item->li_ops->iop_pin(lv->lv_item);
+ if (lv->lv_item->li_ops->iop_pin)
+ lv->lv_item->li_ops->iop_pin(lv->lv_item);
lv->lv_item->li_lv_shadow = NULL;
} else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
@@ -576,12 +574,24 @@ xlog_discard_busy_extents(
*/
static void
xlog_cil_committed(
- void *args,
- int abort)
+ struct xfs_cil_ctx *ctx,
+ bool abort)
{
- struct xfs_cil_ctx *ctx = args;
struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
+ /*
+ * If the I/O failed, we're aborting the commit and already shut down.
+ * Wake any commit waiters before aborting the log items so we don't
+ * block async log pushers on callbacks. Async log pushers explicitly do
+ * not wait on log force completion because they may be holding locks
+ * required to unpin items.
+ */
+ if (abort) {
+ spin_lock(&ctx->cil->xc_push_lock);
+ wake_up_all(&ctx->cil->xc_commit_wait);
+ spin_unlock(&ctx->cil->xc_push_lock);
+ }
+
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort);
@@ -589,15 +599,7 @@ xlog_cil_committed(
xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
- /*
- * If we are aborting the commit, wake up anyone waiting on the
- * committing list. If we don't, then a shutdown we can leave processes
- * waiting in xlog_cil_force_lsn() waiting on a sequence commit that
- * will never happen because we aborted it.
- */
spin_lock(&ctx->cil->xc_push_lock);
- if (abort)
- wake_up_all(&ctx->cil->xc_commit_wait);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
@@ -609,6 +611,20 @@ xlog_cil_committed(
kmem_free(ctx);
}
+void
+xlog_cil_process_committed(
+ struct list_head *list,
+ bool aborted)
+{
+ struct xfs_cil_ctx *ctx;
+
+ while ((ctx = list_first_entry_or_null(list,
+ struct xfs_cil_ctx, iclog_entry))) {
+ list_del(&ctx->iclog_entry);
+ xlog_cil_committed(ctx, aborted);
+ }
+}
+
/*
* Push the Committed Item List to the log. If @push_seq flag is zero, then it
* is a background flush and so we can chose to ignore it. Otherwise, if the
@@ -830,12 +846,15 @@ restart:
if (commit_lsn == -1)
goto out_abort;
- /* attach all the transactions w/ busy extents to iclog */
- ctx->log_cb.cb_func = xlog_cil_committed;
- ctx->log_cb.cb_arg = ctx;
- error = xfs_log_notify(commit_iclog, &ctx->log_cb);
- if (error)
+ spin_lock(&commit_iclog->ic_callback_lock);
+ if (commit_iclog->ic_state & XLOG_STATE_IOERROR) {
+ spin_unlock(&commit_iclog->ic_callback_lock);
goto out_abort;
+ }
+ ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE ||
+ commit_iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks);
+ spin_unlock(&commit_iclog->ic_callback_lock);
/*
* now the checkpoint commit is complete and we've attached the
@@ -859,7 +878,7 @@ out_skip:
out_abort_free_ticket:
xfs_log_ticket_put(tic);
out_abort:
- xlog_cil_committed(ctx, XFS_LI_ABORTED);
+ xlog_cil_committed(ctx, true);
return -EIO;
}
@@ -979,6 +998,7 @@ xfs_log_commit_cil(
{
struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
+ struct xfs_log_item *lip, *next;
xfs_lsn_t xc_commit_lsn;
/*
@@ -1003,7 +1023,7 @@ xfs_log_commit_cil(
/*
* Once all the items of the transaction have been copied to the CIL,
- * the items can be unlocked and freed.
+ * the items can be unlocked and possibly freed.
*
* This needs to be done before we drop the CIL context lock because we
* have to update state in the log items and unlock them before they go
@@ -1012,8 +1032,12 @@ xfs_log_commit_cil(
* the log items. This affects (at least) processing of stale buffers,
* inodes and EFIs.
*/
- xfs_trans_free_items(tp, xc_commit_lsn, false);
-
+ trace_xfs_trans_commit_items(tp, _RET_IP_);
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ xfs_trans_del_item(lip);
+ if (lip->li_ops->iop_committing)
+ lip->li_ops->iop_committing(lip, xc_commit_lsn);
+ }
xlog_cil_push_background(log);
up_read(&cil->xc_ctx_lock);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b5f82cb36202..b880c23cb6e4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -10,7 +10,6 @@ struct xfs_buf;
struct xlog;
struct xlog_ticket;
struct xfs_mount;
-struct xfs_log_callback;
/*
* Flags for log structure
@@ -50,7 +49,6 @@ static inline uint xlog_get_client_id(__be32 i)
#define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */
#define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/
#define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */
-#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */
#define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */
#define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */
@@ -179,11 +177,10 @@ typedef struct xlog_ticket {
* the iclog.
* - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
* - ic_next is the pointer to the next iclog in the ring.
- * - ic_bp is a pointer to the buffer used to write this incore log to disk.
* - ic_log is a pointer back to the global log structure.
- * - ic_callback is a linked list of callback function/argument pairs to be
- * called after an iclog finishes writing.
- * - ic_size is the full size of the header plus data.
+ * - ic_size is the full size of the log buffer, minus the cycle headers.
+ * - ic_io_size is the size of the currently pending log buffer write, which
+ * might be smaller than ic_size.
* - ic_offset is the current number of bytes written to in this iclog.
* - ic_refcnt is bumped when someone is writing to the log.
* - ic_state is the state of the iclog.
@@ -193,7 +190,7 @@ typedef struct xlog_ticket {
* structure cacheline aligned. The following fields can be contended on
* by independent processes:
*
- * - ic_callback_*
+ * - ic_callbacks
* - ic_refcnt
* - fields protected by the global l_icloglock
*
@@ -206,23 +203,28 @@ typedef struct xlog_in_core {
wait_queue_head_t ic_write_wait;
struct xlog_in_core *ic_next;
struct xlog_in_core *ic_prev;
- struct xfs_buf *ic_bp;
struct xlog *ic_log;
- int ic_size;
- int ic_offset;
- int ic_bwritecnt;
+ u32 ic_size;
+ u32 ic_io_size;
+ u32 ic_offset;
unsigned short ic_state;
char *ic_datap; /* pointer to iclog data */
/* Callback structures need their own cacheline */
spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
- struct xfs_log_callback *ic_callback;
- struct xfs_log_callback **ic_callback_tail;
+ struct list_head ic_callbacks;
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp;
xlog_in_core_2_t *ic_data;
#define ic_header ic_data->hic_header
+#ifdef DEBUG
+ bool ic_fail_crc : 1;
+#endif
+ struct semaphore ic_sema;
+ struct work_struct ic_end_io_work;
+ struct bio ic_bio;
+ struct bio_vec ic_bvec[];
} xlog_in_core_t;
/*
@@ -243,7 +245,7 @@ struct xfs_cil_ctx {
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
- struct xfs_log_callback log_cb; /* completion callback hook. */
+ struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
};
@@ -350,9 +352,8 @@ struct xlog {
struct xfs_mount *l_mp; /* mount point */
struct xfs_ail *l_ailp; /* AIL log is working with */
struct xfs_cil *l_cilp; /* CIL log is working with */
- struct xfs_buf *l_xbuf; /* extra buffer for log
- * wrapping */
struct xfs_buftarg *l_targ; /* buftarg of log */
+ struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */
struct delayed_work l_work; /* background flush work */
uint l_flags;
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
@@ -361,7 +362,6 @@ struct xlog {
int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
int l_iclog_size; /* size of log in bytes */
- int l_iclog_size_log; /* log power size of log */
int l_iclog_bufs; /* number of iclog buffers */
xfs_daddr_t l_logBBstart; /* start block of log */
int l_logsize; /* size of log in bytes */
@@ -418,7 +418,7 @@ xlog_recover(
extern int
xlog_recover_finish(
struct xlog *log);
-extern int
+extern void
xlog_recover_cancel(struct xlog *);
extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9fe88d125f0a..13d1d3e95b88 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -13,8 +13,6 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_log.h"
@@ -26,7 +24,6 @@
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_quota.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
@@ -79,7 +76,7 @@ struct xfs_buf_cancel {
* are valid, false otherwise.
*/
static inline bool
-xlog_verify_bp(
+xlog_verify_bno(
struct xlog *log,
xfs_daddr_t blk_no,
int bbcount)
@@ -92,22 +89,19 @@ xlog_verify_bp(
}
/*
- * Allocate a buffer to hold log data. The buffer needs to be able
- * to map to a range of nbblks basic blocks at any valid (basic
- * block) offset within the log.
+ * Allocate a buffer to hold log data. The buffer needs to be able to map to
+ * a range of nbblks basic blocks at any valid offset within the log.
*/
-STATIC xfs_buf_t *
-xlog_get_bp(
+static char *
+xlog_alloc_buffer(
struct xlog *log,
int nbblks)
{
- struct xfs_buf *bp;
-
/*
* Pass log block 0 since we don't have an addr yet, buffer will be
* verified on read.
*/
- if (!xlog_verify_bp(log, 0, nbblks)) {
+ if (!xlog_verify_bno(log, 0, nbblks)) {
xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
@@ -115,69 +109,48 @@ xlog_get_bp(
}
/*
- * We do log I/O in units of log sectors (a power-of-2
- * multiple of the basic block size), so we round up the
- * requested size to accommodate the basic blocks required
- * for complete log sectors.
+ * We do log I/O in units of log sectors (a power-of-2 multiple of the
+ * basic block size), so we round up the requested size to accommodate
+ * the basic blocks required for complete log sectors.
*
- * In addition, the buffer may be used for a non-sector-
- * aligned block offset, in which case an I/O of the
- * requested size could extend beyond the end of the
- * buffer. If the requested size is only 1 basic block it
- * will never straddle a sector boundary, so this won't be
- * an issue. Nor will this be a problem if the log I/O is
- * done in basic blocks (sector size 1). But otherwise we
- * extend the buffer by one extra log sector to ensure
- * there's space to accommodate this possibility.
+ * In addition, the buffer may be used for a non-sector-aligned block
+ * offset, in which case an I/O of the requested size could extend
+ * beyond the end of the buffer. If the requested size is only 1 basic
+ * block it will never straddle a sector boundary, so this won't be an
+ * issue. Nor will this be a problem if the log I/O is done in basic
+ * blocks (sector size 1). But otherwise we extend the buffer by one
+ * extra log sector to ensure there's space to accommodate this
+ * possibility.
*/
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
-
- bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
- if (bp)
- xfs_buf_unlock(bp);
- return bp;
-}
-
-STATIC void
-xlog_put_bp(
- xfs_buf_t *bp)
-{
- xfs_buf_free(bp);
+ return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL);
}
/*
* Return the address of the start of the given block number's data
* in a log buffer. The buffer covers a log sector-aligned region.
*/
-STATIC char *
+static inline unsigned int
xlog_align(
struct xlog *log,
- xfs_daddr_t blk_no,
- int nbblks,
- struct xfs_buf *bp)
+ xfs_daddr_t blk_no)
{
- xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
-
- ASSERT(offset + nbblks <= bp->b_length);
- return bp->b_addr + BBTOB(offset);
+ return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
}
-
-/*
- * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
- */
-STATIC int
-xlog_bread_noalign(
- struct xlog *log,
- xfs_daddr_t blk_no,
- int nbblks,
- struct xfs_buf *bp)
+static int
+xlog_do_io(
+ struct xlog *log,
+ xfs_daddr_t blk_no,
+ unsigned int nbblks,
+ char *data,
+ unsigned int op)
{
- int error;
+ int error;
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
+ if (!xlog_verify_bno(log, blk_no, nbblks)) {
xfs_warn(log->l_mp,
"Invalid log block/length (0x%llx, 0x%x) for buffer",
blk_no, nbblks);
@@ -187,107 +160,53 @@ xlog_bread_noalign(
blk_no = round_down(blk_no, log->l_sectBBsize);
nbblks = round_up(nbblks, log->l_sectBBsize);
-
ASSERT(nbblks > 0);
- ASSERT(nbblks <= bp->b_length);
-
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- bp->b_flags |= XBF_READ;
- bp->b_io_length = nbblks;
- bp->b_error = 0;
- error = xfs_buf_submit(bp);
- if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
- xfs_buf_ioerror_alert(bp, __func__);
+ error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
+ BBTOB(nbblks), data, op);
+ if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_alert(log->l_mp,
+ "log recovery %s I/O error at daddr 0x%llx len %d error %d",
+ op == REQ_OP_WRITE ? "write" : "read",
+ blk_no, nbblks, error);
+ }
return error;
}
STATIC int
-xlog_bread(
+xlog_bread_noalign(
struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- struct xfs_buf *bp,
- char **offset)
+ char *data)
{
- int error;
-
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
- if (error)
- return error;
-
- *offset = xlog_align(log, blk_no, nbblks, bp);
- return 0;
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
}
-/*
- * Read at an offset into the buffer. Returns with the buffer in it's original
- * state regardless of the result of the read.
- */
STATIC int
-xlog_bread_offset(
+xlog_bread(
struct xlog *log,
- xfs_daddr_t blk_no, /* block to read from */
- int nbblks, /* blocks to read */
- struct xfs_buf *bp,
- char *offset)
+ xfs_daddr_t blk_no,
+ int nbblks,
+ char *data,
+ char **offset)
{
- char *orig_offset = bp->b_addr;
- int orig_len = BBTOB(bp->b_length);
- int error, error2;
-
- error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
- if (error)
- return error;
-
- error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+ int error;
- /* must reset buffer pointer even on error */
- error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
- if (error)
- return error;
- return error2;
+ error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
+ if (!error)
+ *offset = data + xlog_align(log, blk_no);
+ return error;
}
-/*
- * Write out the buffer at the given block for the given number of blocks.
- * The buffer is kept locked across the write and is returned locked.
- * This can only be used for synchronous log writes.
- */
STATIC int
xlog_bwrite(
struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- struct xfs_buf *bp)
+ char *data)
{
- int error;
-
- if (!xlog_verify_bp(log, blk_no, nbblks)) {
- xfs_warn(log->l_mp,
- "Invalid log block/length (0x%llx, 0x%x) for buffer",
- blk_no, nbblks);
- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
- return -EFSCORRUPTED;
- }
-
- blk_no = round_down(blk_no, log->l_sectBBsize);
- nbblks = round_up(nbblks, log->l_sectBBsize);
-
- ASSERT(nbblks > 0);
- ASSERT(nbblks <= bp->b_length);
-
- XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
- xfs_buf_hold(bp);
- xfs_buf_lock(bp);
- bp->b_io_length = nbblks;
- bp->b_error = 0;
-
- error = xfs_bwrite(bp);
- if (error)
- xfs_buf_ioerror_alert(bp, __func__);
- xfs_buf_relse(bp);
- return error;
+ return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
}
#ifdef DEBUG
@@ -377,10 +296,9 @@ xlog_recover_iodone(
* We're not going to bother about retrying
* this during recovery. One strike!
*/
- if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+ if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
xfs_buf_ioerror_alert(bp, __func__);
- xfs_force_shutdown(bp->b_target->bt_mount,
- SHUTDOWN_META_IO_ERROR);
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
}
}
@@ -405,7 +323,7 @@ xlog_recover_iodone(
STATIC int
xlog_find_cycle_start(
struct xlog *log,
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t first_blk,
xfs_daddr_t *last_blk,
uint cycle)
@@ -419,7 +337,7 @@ xlog_find_cycle_start(
end_blk = *last_blk;
mid_blk = BLK_AVG(first_blk, end_blk);
while (mid_blk != first_blk && mid_blk != end_blk) {
- error = xlog_bread(log, mid_blk, 1, bp, &offset);
+ error = xlog_bread(log, mid_blk, 1, buffer, &offset);
if (error)
return error;
mid_cycle = xlog_get_cycle(offset);
@@ -455,7 +373,7 @@ xlog_find_verify_cycle(
{
xfs_daddr_t i, j;
uint cycle;
- xfs_buf_t *bp;
+ char *buffer;
xfs_daddr_t bufblks;
char *buf = NULL;
int error = 0;
@@ -469,7 +387,7 @@ xlog_find_verify_cycle(
bufblks = 1 << ffs(nbblks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
- while (!(bp = xlog_get_bp(log, bufblks))) {
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
return -ENOMEM;
@@ -480,7 +398,7 @@ xlog_find_verify_cycle(
bcount = min(bufblks, (start_blk + nbblks - i));
- error = xlog_bread(log, i, bcount, bp, &buf);
+ error = xlog_bread(log, i, bcount, buffer, &buf);
if (error)
goto out;
@@ -498,7 +416,7 @@ xlog_find_verify_cycle(
*new_blk = -1;
out:
- xlog_put_bp(bp);
+ kmem_free(buffer);
return error;
}
@@ -522,7 +440,7 @@ xlog_find_verify_log_record(
int extra_bblks)
{
xfs_daddr_t i;
- xfs_buf_t *bp;
+ char *buffer;
char *offset = NULL;
xlog_rec_header_t *head = NULL;
int error = 0;
@@ -532,12 +450,14 @@ xlog_find_verify_log_record(
ASSERT(start_blk != 0 || *last_blk != start_blk);
- if (!(bp = xlog_get_bp(log, num_blks))) {
- if (!(bp = xlog_get_bp(log, 1)))
+ buffer = xlog_alloc_buffer(log, num_blks);
+ if (!buffer) {
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
smallmem = 1;
} else {
- error = xlog_bread(log, start_blk, num_blks, bp, &offset);
+ error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
if (error)
goto out;
offset += ((num_blks - 1) << BBSHIFT);
@@ -554,7 +474,7 @@ xlog_find_verify_log_record(
}
if (smallmem) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out;
}
@@ -607,7 +527,7 @@ xlog_find_verify_log_record(
*last_blk = i;
out:
- xlog_put_bp(bp);
+ kmem_free(buffer);
return error;
}
@@ -629,7 +549,7 @@ xlog_find_head(
struct xlog *log,
xfs_daddr_t *return_head_blk)
{
- xfs_buf_t *bp;
+ char *buffer;
char *offset;
xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
int num_scan_bblks;
@@ -659,20 +579,20 @@ xlog_find_head(
}
first_blk = 0; /* get cycle # of 1st block */
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
- error = xlog_bread(log, 0, 1, bp, &offset);
+ error = xlog_bread(log, 0, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
first_half_cycle = xlog_get_cycle(offset);
last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
- error = xlog_bread(log, last_blk, 1, bp, &offset);
+ error = xlog_bread(log, last_blk, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
last_half_cycle = xlog_get_cycle(offset);
ASSERT(last_half_cycle != 0);
@@ -740,9 +660,10 @@ xlog_find_head(
* ^ we want to locate this spot
*/
stop_on_cycle = last_half_cycle;
- if ((error = xlog_find_cycle_start(log, bp, first_blk,
- &head_blk, last_half_cycle)))
- goto bp_err;
+ error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
+ last_half_cycle);
+ if (error)
+ goto out_free_buffer;
}
/*
@@ -762,7 +683,7 @@ xlog_find_head(
if ((error = xlog_find_verify_cycle(log,
start_blk, num_scan_bblks,
stop_on_cycle, &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1)
head_blk = new_blk;
} else { /* need to read 2 parts of log */
@@ -799,7 +720,7 @@ xlog_find_head(
if ((error = xlog_find_verify_cycle(log, start_blk,
num_scan_bblks - (int)head_blk,
(stop_on_cycle - 1), &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1) {
head_blk = new_blk;
goto validate_head;
@@ -815,7 +736,7 @@ xlog_find_head(
if ((error = xlog_find_verify_cycle(log,
start_blk, (int)head_blk,
stop_on_cycle, &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1)
head_blk = new_blk;
}
@@ -834,13 +755,13 @@ validate_head:
if (error == 1)
error = -EIO;
if (error)
- goto bp_err;
+ goto out_free_buffer;
} else {
start_blk = 0;
ASSERT(head_blk <= INT_MAX);
error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
if (error < 0)
- goto bp_err;
+ goto out_free_buffer;
if (error == 1) {
/* We hit the beginning of the log during our search */
start_blk = log_bbnum - (num_scan_bblks - head_blk);
@@ -853,14 +774,14 @@ validate_head:
if (error == 1)
error = -EIO;
if (error)
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != log_bbnum)
head_blk = new_blk;
} else if (error)
- goto bp_err;
+ goto out_free_buffer;
}
- xlog_put_bp(bp);
+ kmem_free(buffer);
if (head_blk == log_bbnum)
*return_head_blk = 0;
else
@@ -873,9 +794,8 @@ validate_head:
*/
return 0;
- bp_err:
- xlog_put_bp(bp);
-
+out_free_buffer:
+ kmem_free(buffer);
if (error)
xfs_warn(log->l_mp, "failed to find log head");
return error;
@@ -895,7 +815,7 @@ xlog_rseek_logrec_hdr(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int count,
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t *rblk,
struct xlog_rec_header **rhead,
bool *wrapped)
@@ -914,7 +834,7 @@ xlog_rseek_logrec_hdr(
*/
end_blk = head_blk > tail_blk ? tail_blk : 0;
for (i = (int) head_blk - 1; i >= end_blk; i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -933,7 +853,7 @@ xlog_rseek_logrec_hdr(
*/
if (tail_blk >= head_blk && found != count) {
for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -969,7 +889,7 @@ xlog_seek_logrec_hdr(
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int count,
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t *rblk,
struct xlog_rec_header **rhead,
bool *wrapped)
@@ -988,7 +908,7 @@ xlog_seek_logrec_hdr(
*/
end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
for (i = (int) tail_blk; i <= end_blk; i++) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -1006,7 +926,7 @@ xlog_seek_logrec_hdr(
*/
if (tail_blk > head_blk && found != count) {
for (i = 0; i < (int) head_blk; i++) {
- error = xlog_bread(log, i, 1, bp, &offset);
+ error = xlog_bread(log, i, 1, buffer, &offset);
if (error)
goto out_error;
@@ -1069,22 +989,22 @@ xlog_verify_tail(
int hsize)
{
struct xlog_rec_header *thead;
- struct xfs_buf *bp;
+ char *buffer;
xfs_daddr_t first_bad;
int error = 0;
bool wrapped;
xfs_daddr_t tmp_tail;
xfs_daddr_t orig_tail = *tail_blk;
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
/*
* Make sure the tail points to a record (returns positive count on
* success).
*/
- error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
+ error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
&tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
@@ -1113,8 +1033,8 @@ xlog_verify_tail(
break;
/* skip to the next record; returns positive count on success */
- error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
- &tmp_tail, &thead, &wrapped);
+ error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
+ buffer, &tmp_tail, &thead, &wrapped);
if (error < 0)
goto out;
@@ -1129,7 +1049,7 @@ xlog_verify_tail(
"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
orig_tail, *tail_blk);
out:
- xlog_put_bp(bp);
+ kmem_free(buffer);
return error;
}
@@ -1151,13 +1071,13 @@ xlog_verify_head(
struct xlog *log,
xfs_daddr_t *head_blk, /* in/out: unverified head */
xfs_daddr_t *tail_blk, /* out: tail block */
- struct xfs_buf *bp,
+ char *buffer,
xfs_daddr_t *rhead_blk, /* start blk of last record */
struct xlog_rec_header **rhead, /* ptr to last record */
bool *wrapped) /* last rec. wraps phys. log */
{
struct xlog_rec_header *tmp_rhead;
- struct xfs_buf *tmp_bp;
+ char *tmp_buffer;
xfs_daddr_t first_bad;
xfs_daddr_t tmp_rhead_blk;
int found;
@@ -1168,15 +1088,15 @@ xlog_verify_head(
* Check the head of the log for torn writes. Search backwards from the
* head until we hit the tail or the maximum number of log record I/Os
* that could have been in flight at one time. Use a temporary buffer so
- * we don't trash the rhead/bp pointers from the caller.
+ * we don't trash the rhead/buffer pointers from the caller.
*/
- tmp_bp = xlog_get_bp(log, 1);
- if (!tmp_bp)
+ tmp_buffer = xlog_alloc_buffer(log, 1);
+ if (!tmp_buffer)
return -ENOMEM;
error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
- XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
- &tmp_rhead, &tmp_wrapped);
- xlog_put_bp(tmp_bp);
+ XLOG_MAX_ICLOGS, tmp_buffer,
+ &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
+ kmem_free(tmp_buffer);
if (error < 0)
return error;
@@ -1205,8 +1125,8 @@ xlog_verify_head(
* (i.e., the records with invalid CRC) if the cycle number
* matches the current cycle.
*/
- found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
- rhead_blk, rhead, wrapped);
+ found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
+ buffer, rhead_blk, rhead, wrapped);
if (found < 0)
return found;
if (found == 0) /* XXX: right thing to do here? */
@@ -1266,7 +1186,7 @@ xlog_check_unmount_rec(
xfs_daddr_t *tail_blk,
struct xlog_rec_header *rhead,
xfs_daddr_t rhead_blk,
- struct xfs_buf *bp,
+ char *buffer,
bool *clean)
{
struct xlog_op_header *op_head;
@@ -1309,7 +1229,7 @@ xlog_check_unmount_rec(
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
- error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
+ error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
if (error)
return error;
@@ -1388,7 +1308,7 @@ xlog_find_tail(
{
xlog_rec_header_t *rhead;
char *offset = NULL;
- xfs_buf_t *bp;
+ char *buffer;
int error;
xfs_daddr_t rhead_blk;
xfs_lsn_t tail_lsn;
@@ -1402,11 +1322,11 @@ xlog_find_tail(
return error;
ASSERT(*head_blk < INT_MAX);
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
if (*head_blk == 0) { /* special case */
- error = xlog_bread(log, 0, 1, bp, &offset);
+ error = xlog_bread(log, 0, 1, buffer, &offset);
if (error)
goto done;
@@ -1422,7 +1342,7 @@ xlog_find_tail(
* block. This wraps all the way back around to the head so something is
* seriously wrong if we can't find it.
*/
- error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+ error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
&rhead_blk, &rhead, &wrapped);
if (error < 0)
return error;
@@ -1443,7 +1363,7 @@ xlog_find_tail(
* state to determine whether recovery is necessary.
*/
error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
- rhead_blk, bp, &clean);
+ rhead_blk, buffer, &clean);
if (error)
goto done;
@@ -1460,7 +1380,7 @@ xlog_find_tail(
if (!clean) {
xfs_daddr_t orig_head = *head_blk;
- error = xlog_verify_head(log, head_blk, tail_blk, bp,
+ error = xlog_verify_head(log, head_blk, tail_blk, buffer,
&rhead_blk, &rhead, &wrapped);
if (error)
goto done;
@@ -1471,7 +1391,7 @@ xlog_find_tail(
wrapped);
tail_lsn = atomic64_read(&log->l_tail_lsn);
error = xlog_check_unmount_rec(log, head_blk, tail_blk,
- rhead, rhead_blk, bp,
+ rhead, rhead_blk, buffer,
&clean);
if (error)
goto done;
@@ -1505,11 +1425,11 @@ xlog_find_tail(
* But... if the -device- itself is readonly, just skip this.
* We can't recover this device anyway, so it won't matter.
*/
- if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
+ if (!xfs_readonly_buftarg(log->l_targ))
error = xlog_clear_stale_blocks(log, tail_lsn);
done:
- xlog_put_bp(bp);
+ kmem_free(buffer);
if (error)
xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1537,7 +1457,7 @@ xlog_find_zeroed(
struct xlog *log,
xfs_daddr_t *blk_no)
{
- xfs_buf_t *bp;
+ char *buffer;
char *offset;
uint first_cycle, last_cycle;
xfs_daddr_t new_blk, last_blk, start_blk;
@@ -1547,35 +1467,36 @@ xlog_find_zeroed(
*blk_no = 0;
/* check totally zeroed log */
- bp = xlog_get_bp(log, 1);
- if (!bp)
+ buffer = xlog_alloc_buffer(log, 1);
+ if (!buffer)
return -ENOMEM;
- error = xlog_bread(log, 0, 1, bp, &offset);
+ error = xlog_bread(log, 0, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
first_cycle = xlog_get_cycle(offset);
if (first_cycle == 0) { /* completely zeroed log */
*blk_no = 0;
- xlog_put_bp(bp);
+ kmem_free(buffer);
return 1;
}
/* check partially zeroed log */
- error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
+ error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
if (error)
- goto bp_err;
+ goto out_free_buffer;
last_cycle = xlog_get_cycle(offset);
if (last_cycle != 0) { /* log completely written to */
- xlog_put_bp(bp);
+ kmem_free(buffer);
return 0;
}
/* we have a partially zeroed log */
last_blk = log_bbnum-1;
- if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
- goto bp_err;
+ error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
+ if (error)
+ goto out_free_buffer;
/*
* Validate the answer. Because there is no way to guarantee that
@@ -1598,7 +1519,7 @@ xlog_find_zeroed(
*/
if ((error = xlog_find_verify_cycle(log, start_blk,
(int)num_scan_bblks, 0, &new_blk)))
- goto bp_err;
+ goto out_free_buffer;
if (new_blk != -1)
last_blk = new_blk;
@@ -1610,11 +1531,11 @@ xlog_find_zeroed(
if (error == 1)
error = -EIO;
if (error)
- goto bp_err;
+ goto out_free_buffer;
*blk_no = last_blk;
-bp_err:
- xlog_put_bp(bp);
+out_free_buffer:
+ kmem_free(buffer);
if (error)
return error;
return 1;
@@ -1657,7 +1578,7 @@ xlog_write_log_records(
int tail_block)
{
char *offset;
- xfs_buf_t *bp;
+ char *buffer;
int balign, ealign;
int sectbb = log->l_sectBBsize;
int end_block = start_block + blocks;
@@ -1674,7 +1595,7 @@ xlog_write_log_records(
bufblks = 1 << ffs(blocks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
- while (!(bp = xlog_get_bp(log, bufblks))) {
+ while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
return -ENOMEM;
@@ -1686,9 +1607,9 @@ xlog_write_log_records(
*/
balign = round_down(start_block, sectbb);
if (balign != start_block) {
- error = xlog_bread_noalign(log, start_block, 1, bp);
+ error = xlog_bread_noalign(log, start_block, 1, buffer);
if (error)
- goto out_put_bp;
+ goto out_free_buffer;
j = start_block - balign;
}
@@ -1705,29 +1626,28 @@ xlog_write_log_records(
*/
ealign = round_down(end_block, sectbb);
if (j == 0 && (start_block + endcount > ealign)) {
- offset = bp->b_addr + BBTOB(ealign - start_block);
- error = xlog_bread_offset(log, ealign, sectbb,
- bp, offset);
+ error = xlog_bread_noalign(log, ealign, sectbb,
+ buffer + BBTOB(ealign - start_block));
if (error)
break;
}
- offset = xlog_align(log, start_block, endcount, bp);
+ offset = buffer + xlog_align(log, start_block);
for (; j < endcount; j++) {
xlog_add_record(log, offset, cycle, i+j,
tail_cycle, tail_block);
offset += BBSIZE;
}
- error = xlog_bwrite(log, start_block, endcount, bp);
+ error = xlog_bwrite(log, start_block, endcount, buffer);
if (error)
break;
start_block += endcount;
j = 0;
}
- out_put_bp:
- xlog_put_bp(bp);
+out_free_buffer:
+ kmem_free(buffer);
return error;
}
@@ -2162,7 +2082,7 @@ xlog_recover_do_inode_buffer(
if (xfs_sb_version_hascrc(&mp->m_sb))
bp->b_ops = &xfs_inode_buf_ops;
- inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
+ inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
offsetof(xfs_dinode_t, di_next_unlinked);
@@ -2204,8 +2124,7 @@ xlog_recover_do_inode_buffer(
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <=
- BBTOB(bp->b_io_length));
+ ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
/*
* The current logged region contains a copy of the
@@ -2439,17 +2358,21 @@ xlog_recover_validate_buf_type(
case XFS_BLFT_BTREE_BUF:
switch (magic32) {
case XFS_ABTB_CRC_MAGIC:
- case XFS_ABTC_CRC_MAGIC:
case XFS_ABTB_MAGIC:
+ bp->b_ops = &xfs_bnobt_buf_ops;
+ break;
+ case XFS_ABTC_CRC_MAGIC:
case XFS_ABTC_MAGIC:
- bp->b_ops = &xfs_allocbt_buf_ops;
+ bp->b_ops = &xfs_cntbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
- case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
- case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_finobt_buf_ops;
+ break;
case XFS_BMAP_CRC_MAGIC:
case XFS_BMAP_MAGIC:
bp->b_ops = &xfs_bmbt_buf_ops;
@@ -2666,7 +2589,7 @@ xlog_recover_do_reg_buffer(
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(BBTOB(bp->b_io_length) >=
+ ASSERT(BBTOB(bp->b_length) >=
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
/*
@@ -2878,23 +2801,22 @@ xlog_recover_buffer_pass2(
*
* Also make sure that only inode buffers with good sizes stay in
* the buffer cache. The kernel moves inodes in buffers of 1 block
- * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
+ * or inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
* running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size)
- * for *our* value of mp->m_inode_cluster_size, then we need to keep
+ * inode buffer size isn't max(blocksize, inode_cluster_size)
+ * for *our* value of inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
* overlap with future reads of those inodes.
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize,
- (uint32_t)log->l_mp->m_inode_cluster_size))) {
+ (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
}
@@ -3045,7 +2967,7 @@ xlog_recover_inode_pass2(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) {
xfs_alert(mp,
"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
__func__, dip, bp, in_f->ilf_ino);
@@ -3256,7 +3178,7 @@ out_owner_change:
/* re-generate the checksum. */
xfs_dinode_calc_crc(log->l_mp, dip);
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
@@ -3395,7 +3317,7 @@ xlog_recover_dquot_pass2(
}
ASSERT(dq_f->qlf_size == 2);
- ASSERT(bp->b_target->bt_mount == mp);
+ ASSERT(bp->b_mount == mp);
bp->b_iodone = xlog_recover_iodone;
xfs_buf_delwri_queue(bp, buffer_list);
@@ -3459,7 +3381,7 @@ xlog_recover_efd_pass2(
{
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
uint64_t efi_id;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
@@ -3845,6 +3767,7 @@ xlog_recover_do_icreate_pass2(
{
struct xfs_mount *mp = log->l_mp;
struct xfs_icreate_log *icl;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
xfs_agnumber_t agno;
xfs_agblock_t agbno;
unsigned int count;
@@ -3894,10 +3817,10 @@ xlog_recover_do_icreate_pass2(
/*
* The inode chunk is either full or sparse and we only support
- * m_ialloc_min_blks sized sparse allocations at this time.
+ * m_ino_geo.ialloc_min_blks sized sparse allocations at this time.
*/
- if (length != mp->m_ialloc_blks &&
- length != mp->m_ialloc_min_blks) {
+ if (length != igeo->ialloc_blks &&
+ length != igeo->ialloc_min_blks) {
xfs_warn(log->l_mp,
"%s: unsupported chunk length", __FUNCTION__);
return -EINVAL;
@@ -3917,13 +3840,13 @@ xlog_recover_do_icreate_pass2(
* buffers for cancellation so we don't overwrite anything written after
* a cancellation.
*/
- bb_per_cluster = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
- nbufs = length / mp->m_blocks_per_cluster;
+ bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ nbufs = length / igeo->blocks_per_cluster;
for (i = 0, cancel_count = 0; i < nbufs; i++) {
xfs_daddr_t daddr;
daddr = XFS_AGB_TO_DADDR(mp, agno,
- agbno + i * mp->m_blocks_per_cluster);
+ agbno + i * igeo->blocks_per_cluster);
if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
cancel_count++;
}
@@ -4952,12 +4875,11 @@ out:
* A cancel occurs when the mount has failed and we're bailing out.
* Release all pending log intent items so they don't pin the AIL.
*/
-STATIC int
+STATIC void
xlog_recover_cancel_intents(
struct xlog *log)
{
struct xfs_log_item *lip;
- int error = 0;
struct xfs_ail_cursor cur;
struct xfs_ail *ailp;
@@ -4997,7 +4919,6 @@ xlog_recover_cancel_intents(
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- return error;
}
/*
@@ -5163,7 +5084,7 @@ xlog_recover_process_iunlinks(
}
}
-STATIC int
+STATIC void
xlog_unpack_data(
struct xlog_rec_header *rhead,
char *dp,
@@ -5186,8 +5107,6 @@ xlog_unpack_data(
dp += BBSIZE;
}
}
-
- return 0;
}
/*
@@ -5202,11 +5121,9 @@ xlog_recover_process(
int pass,
struct list_head *buffer_list)
{
- int error;
__le32 old_crc = rhead->h_crc;
__le32 crc;
-
crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
/*
@@ -5245,9 +5162,7 @@ xlog_recover_process(
return -EFSCORRUPTED;
}
- error = xlog_unpack_data(rhead, dp, log);
- if (error)
- return error;
+ xlog_unpack_data(rhead, dp, log);
return xlog_recover_process_data(log, rhash, rhead, dp, pass,
buffer_list);
@@ -5309,7 +5224,7 @@ xlog_do_recovery_pass(
xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
- xfs_buf_t *hbp, *dbp;
+ char *hbp, *dbp;
int error = 0, h_size, h_len;
int error2 = 0;
int bblks, split_bblks;
@@ -5334,7 +5249,7 @@ xlog_do_recovery_pass(
* iclog header and extract the header size from it. Get a
* new hbp that is the correct size.
*/
- hbp = xlog_get_bp(log, 1);
+ hbp = xlog_alloc_buffer(log, 1);
if (!hbp)
return -ENOMEM;
@@ -5376,23 +5291,23 @@ xlog_do_recovery_pass(
hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
if (h_size % XLOG_HEADER_CYCLE_SIZE)
hblks++;
- xlog_put_bp(hbp);
- hbp = xlog_get_bp(log, hblks);
+ kmem_free(hbp);
+ hbp = xlog_alloc_buffer(log, hblks);
} else {
hblks = 1;
}
} else {
ASSERT(log->l_sectBBsize == 1);
hblks = 1;
- hbp = xlog_get_bp(log, 1);
+ hbp = xlog_alloc_buffer(log, 1);
h_size = XLOG_BIG_RECORD_BSIZE;
}
if (!hbp)
return -ENOMEM;
- dbp = xlog_get_bp(log, BTOBB(h_size));
+ dbp = xlog_alloc_buffer(log, BTOBB(h_size));
if (!dbp) {
- xlog_put_bp(hbp);
+ kmem_free(hbp);
return -ENOMEM;
}
@@ -5407,7 +5322,7 @@ xlog_do_recovery_pass(
/*
* Check for header wrapping around physical end-of-log
*/
- offset = hbp->b_addr;
+ offset = hbp;
split_hblks = 0;
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
@@ -5443,8 +5358,8 @@ xlog_do_recovery_pass(
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- error = xlog_bread_offset(log, 0,
- wrapped_hblks, hbp,
+ error = xlog_bread_noalign(log, 0,
+ wrapped_hblks,
offset + BBTOB(split_hblks));
if (error)
goto bread_err2;
@@ -5475,7 +5390,7 @@ xlog_do_recovery_pass(
} else {
/* This log record is split across the
* physical end of log */
- offset = dbp->b_addr;
+ offset = dbp;
split_bblks = 0;
if (blk_no != log->l_logBBsize) {
/* some data is before the physical
@@ -5504,8 +5419,8 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
- error = xlog_bread_offset(log, 0,
- bblks - split_bblks, dbp,
+ error = xlog_bread_noalign(log, 0,
+ bblks - split_bblks,
offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
@@ -5553,9 +5468,9 @@ xlog_do_recovery_pass(
}
bread_err2:
- xlog_put_bp(dbp);
+ kmem_free(dbp);
bread_err1:
- xlog_put_bp(hbp);
+ kmem_free(hbp);
/*
* Submit buffers that have been added from the last record processed,
@@ -5689,7 +5604,7 @@ xlog_do_recover(
* Now that we've finished replaying all buffer and inode
* updates, re-read in the superblock and reverify it.
*/
- bp = xfs_getsb(mp, 0);
+ bp = xfs_getsb(mp);
bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
ASSERT(!(bp->b_flags & XBF_WRITE));
bp->b_flags |= XBF_READ;
@@ -5862,16 +5777,12 @@ xlog_recover_finish(
return 0;
}
-int
+void
xlog_recover_cancel(
struct xlog *log)
{
- int error = 0;
-
if (log->l_flags & XLOG_RECOVERY_NEEDED)
- error = xlog_recover_cancel_intents(log);
-
- return error;
+ xlog_recover_cancel_intents(log);
}
#if defined(DEBUG)
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 6b736ea58d35..9804efe525a9 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -6,8 +6,8 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_error.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b4d8c318be3c..322da6909290 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -12,9 +12,6 @@
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_ialloc.h"
@@ -27,13 +24,13 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_sysfs.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_reflink.h"
#include "xfs_extent_busy.h"
+#include "xfs_health.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -149,6 +146,7 @@ xfs_free_perag(
spin_unlock(&mp->m_perag_lock);
ASSERT(pag);
ASSERT(atomic_read(&pag->pag_ref) == 0);
+ xfs_iunlink_destroy(pag);
xfs_buf_hash_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
call_rcu(&pag->rcu_head, __xfs_free_perag);
@@ -227,6 +225,10 @@ xfs_initialize_perag(
/* first new pag is fully initialized */
if (first_initialised == NULLAGNUMBER)
first_initialised = index;
+ error = xfs_iunlink_init(pag);
+ if (error)
+ goto out_hash_destroy;
+ spin_lock_init(&pag->pag_state_lock);
}
index = xfs_set_inode_alloc(mp, agcount);
@@ -249,6 +251,7 @@ out_unwind_new_pags:
if (!pag)
break;
xfs_buf_hash_destroy(pag);
+ xfs_iunlink_destroy(pag);
mutex_destroy(&pag->pag_ici_reclaim_lock);
kmem_free(pag);
}
@@ -423,30 +426,6 @@ xfs_update_alignment(xfs_mount_t *mp)
}
/*
- * Set the maximum inode count for this filesystem
- */
-STATIC void
-xfs_set_maxicount(xfs_mount_t *mp)
-{
- xfs_sb_t *sbp = &(mp->m_sb);
- uint64_t icount;
-
- if (sbp->sb_imax_pct) {
- /*
- * Make sure the maximum inode count is a multiple
- * of the units we allocate inodes in.
- */
- icount = sbp->sb_dblocks * sbp->sb_imax_pct;
- do_div(icount, 100);
- do_div(icount, mp->m_ialloc_blks);
- mp->m_maxicount = (icount * mp->m_ialloc_blks) <<
- sbp->sb_inopblog;
- } else {
- mp->m_maxicount = 0;
- }
-}
-
-/*
* Set the default minimum read and write sizes unless
* already specified in a mount option.
* We use smaller I/O sizes when the file system
@@ -502,29 +481,6 @@ xfs_set_low_space_thresholds(
}
}
-
-/*
- * Set whether we're using inode alignment.
- */
-STATIC void
-xfs_set_inoalignment(xfs_mount_t *mp)
-{
- if (xfs_sb_version_hasalign(&mp->m_sb) &&
- mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
- mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1;
- else
- mp->m_inoalign_mask = 0;
- /*
- * If we are using stripe alignment, check whether
- * the stripe unit is a multiple of the inode alignment
- */
- if (mp->m_dalign && mp->m_inoalign_mask &&
- !(mp->m_dalign & mp->m_inoalign_mask))
- mp->m_sinoalign = mp->m_dalign;
- else
- mp->m_sinoalign = 0;
-}
-
/*
* Check that the data (and log if separate) is an ok size.
*/
@@ -639,7 +595,7 @@ xfs_check_summary_counts(
(mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks ||
!xfs_verify_icount(mp, mp->m_sb.sb_icount) ||
mp->m_sb.sb_ifree > mp->m_sb.sb_icount))
- mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
/*
* We can safely re-initialise incore superblock counters from the
@@ -654,7 +610,7 @@ xfs_check_summary_counts(
*/
if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) ||
XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) &&
- !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY))
+ !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
return 0;
return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
@@ -676,6 +632,7 @@ xfs_mountfs(
{
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
@@ -742,12 +699,10 @@ xfs_mountfs(
xfs_alloc_compute_maxlevels(mp);
xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK);
xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
- xfs_ialloc_compute_maxlevels(mp);
+ xfs_ialloc_setup_geometry(mp);
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
- xfs_set_maxicount(mp);
-
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
@@ -781,50 +736,22 @@ xfs_mountfs(
xfs_set_low_space_thresholds(mp);
/*
- * Set the inode cluster size.
- * This may still be overridden by the file system
- * block size if it is larger than the chosen cluster size.
- *
- * For v5 filesystems, scale the cluster size with the inode size to
- * keep a constant ratio of inode per cluster buffer, but only if mkfs
- * has set the inode alignment value appropriately for larger cluster
- * sizes.
- */
- mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- int new_size = mp->m_inode_cluster_size;
-
- new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
- if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
- mp->m_inode_cluster_size = new_size;
- }
- mp->m_blocks_per_cluster = xfs_icluster_size_fsb(mp);
- mp->m_inodes_per_cluster = XFS_FSB_TO_INO(mp, mp->m_blocks_per_cluster);
- mp->m_cluster_align = xfs_ialloc_cluster_alignment(mp);
- mp->m_cluster_align_inodes = XFS_FSB_TO_INO(mp, mp->m_cluster_align);
-
- /*
* If enabled, sparse inode chunk alignment is expected to match the
* cluster size. Full inode chunk alignment must match the chunk size,
* but that is checked on sb read verification...
*/
if (xfs_sb_version_hassparseinodes(&mp->m_sb) &&
mp->m_sb.sb_spino_align !=
- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) {
+ XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) {
xfs_warn(mp,
"Sparse inode block alignment (%u) must match cluster size (%llu).",
mp->m_sb.sb_spino_align,
- XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size));
+ XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw));
error = -EINVAL;
goto out_remove_uuid;
}
/*
- * Set inode alignment fields
- */
- xfs_set_inoalignment(mp);
-
- /*
* Check that the data (and log if separate) is an ok size.
*/
error = xfs_check_sizes(mp);
@@ -1063,6 +990,7 @@ xfs_mountfs(
*/
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_health_unmount(mp);
out_log_dealloc:
mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
@@ -1099,7 +1027,7 @@ xfs_unmountfs(
uint64_t resblks;
int error;
- xfs_icache_disable_reclaim(mp);
+ xfs_stop_block_reaping(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
@@ -1145,6 +1073,7 @@ xfs_unmountfs(
*/
cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_health_unmount(mp);
xfs_qm_unmount(mp);
@@ -1376,24 +1305,14 @@ xfs_mod_frextents(
* xfs_getsb() is called to obtain the buffer for the superblock.
* The buffer is returned locked and read in from disk.
* The buffer should be released with a call to xfs_brelse().
- *
- * If the flags parameter is BUF_TRYLOCK, then we'll only return
- * the superblock buffer if it can be locked without sleeping.
- * If it can't then we'll return NULL.
*/
struct xfs_buf *
xfs_getsb(
- struct xfs_mount *mp,
- int flags)
+ struct xfs_mount *mp)
{
struct xfs_buf *bp = mp->m_sb_bp;
- if (!xfs_buf_trylock(bp)) {
- if (flags & XBF_TRYLOCK)
- return NULL;
- xfs_buf_lock(bp);
- }
-
+ xfs_buf_lock(bp);
xfs_buf_hold(bp);
ASSERT(bp->b_flags & XBF_DONE);
return bp;
@@ -1440,7 +1359,26 @@ xfs_force_summary_recalc(
if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
return;
- spin_lock(&mp->m_sb_lock);
- mp->m_flags |= XFS_MOUNT_BAD_SUMMARY;
- spin_unlock(&mp->m_sb_lock);
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+}
+
+/*
+ * Update the in-core delayed block counter.
+ *
+ * We prefer to update the counter without having to take a spinlock for every
+ * counter update (i.e. batching). Each change to delayed allocation
+ * reservations can easily exceed the default percpu counter
+ * batching, so we use a larger batch factor here.
+ *
+ * Note that we don't currently have any callers requiring fast summation
+ * (e.g. percpu_counter_read) so we can use a big batch value here.
+ */
+#define XFS_DELALLOC_BATCH (4096)
+void
+xfs_mod_delalloc(
+ struct xfs_mount *mp,
+ int64_t delta)
+{
+ percpu_counter_add_batch(&mp->m_delalloc_blks, delta,
+ XFS_DELALLOC_BATCH);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7daafe064af8..4adb6837439a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,20 @@ struct xfs_error_cfg {
typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
+
+ /*
+ * Bitsets of per-fs metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access these two fields.
+ */
+ uint8_t m_fs_checked;
+ uint8_t m_fs_sick;
+ /*
+ * Bitsets of rt metadata that have been checked and/or are sick.
+ * Callers must hold m_sb_lock to access this field.
+ */
+ uint8_t m_rt_checked;
+ uint8_t m_rt_sick;
+
struct xfs_ail *m_ail; /* fs active log item list */
struct xfs_sb m_sb; /* copy of fs superblock */
@@ -67,6 +81,12 @@ typedef struct xfs_mount {
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
+ /*
+ * Count of data device blocks reserved for delayed allocations,
+ * including indlen blocks. Does not include allocated CoW staging
+ * extents or anything related to the rt device.
+ */
+ struct percpu_counter m_delalloc_blks;
struct xfs_buf *m_sb_bp; /* buffer for superblock */
char *m_fsname; /* filesystem name */
@@ -85,6 +105,7 @@ typedef struct xfs_mount {
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
+ struct xfs_ino_geometry m_ino_geo; /* inode geometry */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
uint m_rsumlevels; /* rt summary levels */
@@ -106,12 +127,6 @@ typedef struct xfs_mount {
uint8_t m_blkbit_log; /* blocklog + NBBY */
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
uint8_t m_agno_log; /* log #ag's */
- uint8_t m_agino_log; /* #bits for agino in inum */
- uint m_inode_cluster_size;/* min inode buf size */
- unsigned int m_inodes_per_cluster;
- unsigned int m_blocks_per_cluster;
- unsigned int m_cluster_align;
- unsigned int m_cluster_align_inodes;
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -119,15 +134,12 @@ typedef struct xfs_mount {
uint m_alloc_mnr[2]; /* min alloc btree records */
uint m_bmap_dmxr[2]; /* max bmap btree records */
uint m_bmap_dmnr[2]; /* min bmap btree records */
- uint m_inobt_mxr[2]; /* max inobt btree records */
- uint m_inobt_mnr[2]; /* min inobt btree records */
uint m_rmap_mxr[2]; /* max rmap btree records */
uint m_rmap_mnr[2]; /* min rmap btree records */
uint m_refc_mxr[2]; /* max refc btree records */
uint m_refc_mnr[2]; /* min refc btree records */
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
- uint m_in_maxlevels; /* max inobt btree levels. */
uint m_rmap_maxlevels; /* max rmap btree levels */
uint m_refc_maxlevels; /* max refcount btree level */
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
@@ -138,21 +150,14 @@ typedef struct xfs_mount {
struct mutex m_growlock; /* growfs mutex */
int m_fixedfsid[2]; /* unchanged for life of FS */
uint64_t m_flags; /* global mount flags */
- bool m_inotbt_nores; /* no per-AG finobt resv. */
- int m_ialloc_inos; /* inodes in inode allocation */
- int m_ialloc_blks; /* blocks in inode allocation */
- int m_ialloc_min_blks;/* min blocks in sparse inode
- * allocation */
- int m_inoalign_mask;/* mask sb_inoalignmt if used */
+ bool m_finobt_nores; /* no per-AG finobt resv. */
uint m_qflags; /* quota status flags */
struct xfs_trans_resv m_resv; /* precomputed res values */
- uint64_t m_maxicount; /* maximum inode count */
uint64_t m_resblks; /* total reserved blocks */
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
- int m_sinoalign; /* stripe unit inode alignment */
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
@@ -175,11 +180,9 @@ typedef struct xfs_mount {
struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
- struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
struct workqueue_struct *m_reclaim_workqueue;
- struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
struct workqueue_struct *m_sync_workqueue;
@@ -194,6 +197,7 @@ typedef struct xfs_mount {
*/
uint32_t m_generation;
+ bool m_always_cow;
bool m_fail_unmount;
#ifdef DEBUG
/*
@@ -206,6 +210,8 @@ typedef struct xfs_mount {
#endif
} xfs_mount_t;
+#define M_IGEO(mp) (&(mp)->m_ino_geo)
+
/*
* Flags for m_flags.
*/
@@ -213,7 +219,6 @@ typedef struct xfs_mount {
must be synchronous except
for space allocations */
#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */
-#define XFS_MOUNT_BAD_SUMMARY (1ULL << 2) /* summary counters are bad */
#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
@@ -368,6 +373,15 @@ typedef struct xfs_perag {
xfs_agino_t pagl_pagino;
xfs_agino_t pagl_leftrec;
xfs_agino_t pagl_rightrec;
+
+ /*
+ * Bitsets of per-ag metadata that have been checked and/or are sick.
+ * Callers should hold pag_state_lock before accessing these fields.
+ */
+ uint16_t pag_checked;
+ uint16_t pag_sick;
+ spinlock_t pag_state_lock;
+
spinlock_t pagb_lock; /* lock for pagb_tree */
struct rb_root pagb_tree; /* ordered tree of busy extents */
unsigned int pagb_gen; /* generation count for pagb_tree */
@@ -396,6 +410,13 @@ typedef struct xfs_perag {
/* reference count */
uint8_t pagf_refcount_level;
+
+ /*
+ * Unlinked inode information. This incore information reflects
+ * data stored in the AGI, so callers must hold the AGI buffer lock
+ * or have some other means to control concurrency.
+ */
+ struct rhashtable pagi_unlinked_hash;
} xfs_perag_t;
static inline struct xfs_ag_resv *
@@ -430,7 +451,7 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
-extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
+extern struct xfs_buf *xfs_getsb(xfs_mount_t *);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern bool xfs_fs_writable(struct xfs_mount *mp, int level);
@@ -446,5 +467,6 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp,
int error_class, int error);
void xfs_force_summary_recalc(struct xfs_mount *mp);
+void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta);
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index d3e04d20d8d4..b6701b4f59a9 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -125,6 +125,32 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+
+ /*
+ * The v5 superblock format extended several v4 header structures with
+ * additional data. While new fields are only accessible on v5
+ * superblocks, it's important that the v5 structures place original v4
+ * fields/headers in the correct location on-disk. For example, we must
+ * be able to find magic values at the same location in certain blocks
+ * regardless of superblock version.
+ *
+ * The following checks ensure that various v5 data structures place the
+ * subset of v4 metadata associated with the same type of block at the
+ * start of the on-disk block. If there is no data structure definition
+ * for certain types of v4 blocks, traverse down to the first field of
+ * common metadata (e.g., magic value) and make sure it is at offset
+ * zero.
+ */
+ XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0);
+
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat, 192);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f44c3599527d..0c954cad7449 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -2,23 +2,16 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
-#include <linux/iomap.h>
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_log.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
-#include "xfs_error.h"
#include "xfs_iomap.h"
-#include "xfs_shared.h"
-#include "xfs_bit.h"
-#include "xfs_pnfs.h"
/*
* Ensure that we do not have any outstanding pNFS layouts that can be used by
@@ -185,7 +178,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c
new file mode 100644
index 000000000000..4bcc3e61056c
--- /dev/null
+++ b/fs/xfs/xfs_pwork.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+#include "xfs_sysctl.h"
+#include "xfs_pwork.h"
+#include <linux/nmi.h>
+
+/*
+ * Parallel Work Queue
+ * ===================
+ *
+ * Abstract away the details of running a large and "obviously" parallelizable
+ * task across multiple CPUs. Callers initialize the pwork control object with
+ * a desired level of parallelization and a work function. Next, they embed
+ * struct xfs_pwork in whatever structure they use to pass work context to a
+ * worker thread and queue that pwork. The work function will be passed the
+ * pwork item when it is run (from process context) and any returned error will
+ * be recorded in xfs_pwork_ctl.error. Work functions should check for errors
+ * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not
+ * stop workqueue item processing.
+ *
+ * This is the rough equivalent of the xfsprogs workqueue code, though we can't
+ * reuse that name here.
+ */
+
+/* Invoke our caller's function. */
+static void
+xfs_pwork_work(
+ struct work_struct *work)
+{
+ struct xfs_pwork *pwork;
+ struct xfs_pwork_ctl *pctl;
+ int error;
+
+ pwork = container_of(work, struct xfs_pwork, work);
+ pctl = pwork->pctl;
+ error = pctl->work_fn(pctl->mp, pwork);
+ if (error && !pctl->error)
+ pctl->error = error;
+ if (atomic_dec_and_test(&pctl->nr_work))
+ wake_up(&pctl->poll_wait);
+}
+
+/*
+ * Set up control data for parallel work. @work_fn is the function that will
+ * be called. @tag, plus the caller's pid, names the workqueue. @nr_threads is
+ * the level of parallelism desired, or 0 for no limit.
+ */
+int
+xfs_pwork_init(
+ struct xfs_mount *mp,
+ struct xfs_pwork_ctl *pctl,
+ xfs_pwork_work_fn work_fn,
+ const char *tag,
+ unsigned int nr_threads)
+{
+#ifdef DEBUG
+ if (xfs_globals.pwork_threads >= 0)
+ nr_threads = xfs_globals.pwork_threads;
+#endif
+ trace_xfs_pwork_init(mp, nr_threads, current->pid);
+
+ pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag,
+ current->pid);
+ if (!pctl->wq)
+ return -ENOMEM;
+ pctl->work_fn = work_fn;
+ pctl->error = 0;
+ pctl->mp = mp;
+ atomic_set(&pctl->nr_work, 0);
+ init_waitqueue_head(&pctl->poll_wait);
+
+ return 0;
+}
+
+/* Queue some parallel work. */
+void
+xfs_pwork_queue(
+ struct xfs_pwork_ctl *pctl,
+ struct xfs_pwork *pwork)
+{
+ INIT_WORK(&pwork->work, xfs_pwork_work);
+ pwork->pctl = pctl;
+ atomic_inc(&pctl->nr_work);
+ queue_work(pctl->wq, &pwork->work);
+}
+
+/* Wait for the work to finish and tear down the control structure. */
+int
+xfs_pwork_destroy(
+ struct xfs_pwork_ctl *pctl)
+{
+ destroy_workqueue(pctl->wq);
+ pctl->wq = NULL;
+ return pctl->error;
+}
+
+/*
+ * Wait for the work to finish by polling completion status and touch the soft
+ * lockup watchdog. This is for callers such as mount which hold locks.
+ */
+void
+xfs_pwork_poll(
+ struct xfs_pwork_ctl *pctl)
+{
+ while (wait_event_timeout(pctl->poll_wait,
+ atomic_read(&pctl->nr_work) == 0, HZ) == 0)
+ touch_softlockup_watchdog();
+}
+
+/*
+ * Return the amount of parallelism that the data device can handle, or 0 for
+ * no limit.
+ */
+unsigned int
+xfs_pwork_guess_datadev_parallelism(
+ struct xfs_mount *mp)
+{
+ struct xfs_buftarg *btp = mp->m_ddev_targp;
+
+ /*
+ * For now we'll go with the most conservative setting possible,
+ * which is two threads for an SSD and 1 thread everywhere else.
+ */
+ return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1;
+}
diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h
new file mode 100644
index 000000000000..8133124cf3bb
--- /dev/null
+++ b/fs/xfs/xfs_pwork.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_PWORK_H__
+#define __XFS_PWORK_H__
+
+struct xfs_pwork;
+struct xfs_mount;
+
+typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork);
+
+/*
+ * Parallel work coordination structure.
+ */
+struct xfs_pwork_ctl {
+ struct workqueue_struct *wq;
+ struct xfs_mount *mp;
+ xfs_pwork_work_fn work_fn;
+ struct wait_queue_head poll_wait;
+ atomic_t nr_work;
+ int error;
+};
+
+/*
+ * Embed this parallel work control item inside your own work structure,
+ * then queue work with it.
+ */
+struct xfs_pwork {
+ struct work_struct work;
+ struct xfs_pwork_ctl *pctl;
+};
+
+#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL }
+
+/* Have we been told to abort? */
+static inline bool
+xfs_pwork_ctl_want_abort(
+ struct xfs_pwork_ctl *pctl)
+{
+ return pctl && pctl->error;
+}
+
+/* Has this work item's control structure been told to abort? */
+static inline bool
+xfs_pwork_want_abort(
+ struct xfs_pwork *pwork)
+{
+ return xfs_pwork_ctl_want_abort(pwork->pctl);
+}
+
+int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl,
+ xfs_pwork_work_fn work_fn, const char *tag,
+ unsigned int nr_threads);
+void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork);
+int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl);
+void xfs_pwork_poll(struct xfs_pwork_ctl *pctl);
+unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp);
+
+#endif /* __XFS_PWORK_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 52ed7904df10..5e7a37f0cf84 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -13,19 +13,15 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
+#include "xfs_iwalk.h"
#include "xfs_quota.h"
-#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_btree.h"
#include "xfs_bmap_util.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_cksum.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -1118,17 +1114,15 @@ xfs_qm_quotacheck_dqadjust(
/* ARGSUSED */
STATIC int
xfs_qm_dqusage_adjust(
- xfs_mount_t *mp, /* mount point for filesystem */
- xfs_ino_t ino, /* inode number to get data for */
- void __user *buffer, /* not used */
- int ubsize, /* not used */
- int *ubused, /* not used */
- int *res) /* result code value */
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_ino_t ino,
+ void *data)
{
- xfs_inode_t *ip;
- xfs_qcnt_t nblks;
- xfs_filblks_t rtblks = 0; /* total rt blks */
- int error;
+ struct xfs_inode *ip;
+ xfs_qcnt_t nblks;
+ xfs_filblks_t rtblks = 0; /* total rt blks */
+ int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -1136,20 +1130,18 @@ xfs_qm_dqusage_adjust(
* rootino must have its resources accounted for, not so with the quota
* inodes.
*/
- if (xfs_is_quota_inode(&mp->m_sb, ino)) {
- *res = BULKSTAT_RV_NOTHING;
- return -EINVAL;
- }
+ if (xfs_is_quota_inode(&mp->m_sb, ino))
+ return 0;
/*
* We don't _need_ to take the ilock EXCL here because quotacheck runs
* at mount time and therefore nobody will be racing chown/chproj.
*/
- error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
- if (error) {
- *res = BULKSTAT_RV_NOTHING;
+ error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip);
+ if (error == -EINVAL || error == -ENOENT)
+ return 0;
+ if (error)
return error;
- }
ASSERT(ip->i_delayed_blks == 0);
@@ -1157,7 +1149,7 @@ xfs_qm_dqusage_adjust(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
if (error)
goto error0;
}
@@ -1200,13 +1192,8 @@ xfs_qm_dqusage_adjust(
goto error0;
}
- xfs_irele(ip);
- *res = BULKSTAT_RV_DIDONE;
- return 0;
-
error0:
xfs_irele(ip);
- *res = BULKSTAT_RV_GIVEUP;
return error;
}
@@ -1270,18 +1257,13 @@ STATIC int
xfs_qm_quotacheck(
xfs_mount_t *mp)
{
- int done, count, error, error2;
- xfs_ino_t lastino;
- size_t structsz;
+ int error, error2;
uint flags;
LIST_HEAD (buffer_list);
struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip;
struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip;
struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip;
- count = INT_MAX;
- structsz = 1;
- lastino = 0;
flags = 0;
ASSERT(uip || gip || pip);
@@ -1318,18 +1300,10 @@ xfs_qm_quotacheck(
flags |= XFS_PQUOTA_CHKD;
}
- do {
- /*
- * Iterate thru all the inodes in the file system,
- * adjusting the corresponding dquot counters in core.
- */
- error = xfs_bulkstat(mp, &lastino, &count,
- xfs_qm_dqusage_adjust,
- structsz, NULL, &done);
- if (error)
- break;
-
- } while (!done);
+ error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+ NULL);
+ if (error)
+ goto error_return;
/*
* We've made all the changes that we need to make incore. Flush them
@@ -1812,7 +1786,8 @@ xfs_qm_vop_chown_reserve(
uint flags)
{
struct xfs_mount *mp = ip->i_mount;
- uint delblks, blkflags, prjflags = 0;
+ uint64_t delblks;
+ unsigned int blkflags, prjflags = 0;
struct xfs_dquot *udq_unres = NULL;
struct xfs_dquot *gdq_unres = NULL;
struct xfs_dquot *pdq_unres = NULL;
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 3ccf0fbc9071..b41b75089548 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -113,12 +113,8 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags)
return NULL;
}
-extern void xfs_trans_mod_dquot(struct xfs_trans *,
- struct xfs_dquot *, uint, long);
-extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
- struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, struct xfs_dquot *,
- long, long, uint);
+extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
+ uint field, int64_t delta);
extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 3091e4bc04ef..5d72e88598b4 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -5,13 +5,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_qm.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index b3190890f096..da7ad0383037 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
-#include <linux/capability.h>
#include "xfs.h"
#include "xfs_fs.h"
@@ -12,17 +11,13 @@
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_defer.h"
STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 55b798265ef7..efe42ae7a2f3 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -56,32 +56,35 @@ xfs_quota_chkd_flag(
* The structure kept inside the xfs_trans_t keep track of dquot changes
* within a transaction and apply them later.
*/
-typedef struct xfs_dqtrx {
+struct xfs_dqtrx {
struct xfs_dquot *qt_dquot; /* the dquot this refers to */
- ulong qt_blk_res; /* blks reserved on a dquot */
- ulong qt_ino_res; /* inode reserved on a dquot */
- ulong qt_ino_res_used; /* inodes used from the reservation */
- long qt_bcount_delta; /* dquot blk count changes */
- long qt_delbcnt_delta; /* delayed dquot blk count changes */
- long qt_icount_delta; /* dquot inode count changes */
- ulong qt_rtblk_res; /* # blks reserved on a dquot */
- ulong qt_rtblk_res_used;/* # blks used from reservation */
- long qt_rtbcount_delta;/* dquot realtime blk changes */
- long qt_delrtb_delta; /* delayed RT blk count changes */
-} xfs_dqtrx_t;
+
+ uint64_t qt_blk_res; /* blks reserved on a dquot */
+ int64_t qt_bcount_delta; /* dquot blk count changes */
+ int64_t qt_delbcnt_delta; /* delayed dquot blk count changes */
+
+ uint64_t qt_rtblk_res; /* # blks reserved on a dquot */
+ uint64_t qt_rtblk_res_used;/* # blks used from reservation */
+ int64_t qt_rtbcount_delta;/* dquot realtime blk changes */
+ int64_t qt_delrtb_delta; /* delayed RT blk count changes */
+
+ uint64_t qt_ino_res; /* inode reserved on a dquot */
+ uint64_t qt_ino_res_used; /* inodes used from the reservation */
+ int64_t qt_icount_delta; /* dquot inode count changes */
+};
#ifdef CONFIG_XFS_QUOTA
extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
extern void xfs_trans_free_dqinfo(struct xfs_trans *);
extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *,
- uint, long);
+ uint, int64_t);
extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *);
extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *);
extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *,
- struct xfs_inode *, long, long, uint);
+ struct xfs_inode *, int64_t, long, uint);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
- struct xfs_dquot *, struct xfs_dquot *, long, long, uint);
+ struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
@@ -121,14 +124,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
- struct xfs_inode *ip, long nblks, long ninos, uint flags)
+ struct xfs_inode *ip, int64_t nblks, long ninos, uint flags)
{
return 0;
}
static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
struct xfs_mount *mp, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp,
- long nblks, long nions, uint flags)
+ int64_t nblks, long nions, uint flags)
{
return 0;
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a7c0c657dfaf..cd6c7210a373 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -4,6 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -11,10 +12,8 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
-#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_qm.h"
-#include <linux/quota.h>
static void
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fce38b56b962..d8288aa0670a 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -14,7 +14,6 @@
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_refcount_item.h"
#include "xfs_log.h"
#include "xfs_refcount.h"
@@ -95,15 +94,6 @@ xfs_cui_item_format(
}
/*
- * Pinning has no meaning for an cui item, so just return.
- */
-STATIC void
-xfs_cui_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an CUI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the CUI transaction has been successfully committed to make it
@@ -122,71 +112,22 @@ xfs_cui_item_unpin(
}
/*
- * CUI items have no locking or pushing. However, since CUIs are pulled from
- * the AIL when their corresponding CUDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the CUI out of
- * the AIL.
- */
-STATIC uint
-xfs_cui_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The CUI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an CUD isn't going to be
* constructed and thus we free the CUI here directly.
*/
STATIC void
-xfs_cui_item_unlock(
+xfs_cui_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_cui_release(CUI_ITEM(lip));
+ xfs_cui_release(CUI_ITEM(lip));
}
-/*
- * The CUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_cui_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The CUI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_cui_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all cui log items.
- */
static const struct xfs_item_ops xfs_cui_item_ops = {
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
- .iop_pin = xfs_cui_item_pin,
.iop_unpin = xfs_cui_item_unpin,
- .iop_unlock = xfs_cui_item_unlock,
- .iop_committed = xfs_cui_item_committed,
- .iop_push = xfs_cui_item_push,
- .iop_committing = xfs_cui_item_committing,
+ .iop_release = xfs_cui_item_release,
};
/*
@@ -254,126 +195,250 @@ xfs_cud_item_format(
}
/*
- * Pinning has no meaning for an cud item, so just return.
+ * The CUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the CUI and free the
+ * CUD.
*/
STATIC void
-xfs_cud_item_pin(
+xfs_cud_item_release(
struct xfs_log_item *lip)
{
+ struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+
+ xfs_cui_release(cudp->cud_cuip);
+ kmem_zone_free(xfs_cud_zone, cudp);
}
-/*
- * Since pinning has no meaning for an cud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_cud_item_unpin(
- struct xfs_log_item *lip,
- int remove)
+static const struct xfs_item_ops xfs_cud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .iop_size = xfs_cud_item_size,
+ .iop_format = xfs_cud_item_format,
+ .iop_release = xfs_cud_item_release,
+};
+
+static struct xfs_cud_log_item *
+xfs_trans_get_cud(
+ struct xfs_trans *tp,
+ struct xfs_cui_log_item *cuip)
{
+ struct xfs_cud_log_item *cudp;
+
+ cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+ &xfs_cud_item_ops);
+ cudp->cud_cuip = cuip;
+ cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+ xfs_trans_add_item(tp, &cudp->cud_item);
+ return cudp;
}
/*
- * There isn't much you can do to push on an cud item. It is simply stuck
- * waiting for the log to be flushed to disk.
+ * Finish a refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
*/
-STATIC uint
-xfs_cud_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
+static int
+xfs_trans_log_finish_refcount_update(
+ struct xfs_trans *tp,
+ struct xfs_cud_log_item *cudp,
+ enum xfs_refcount_intent_type type,
+ xfs_fsblock_t startblock,
+ xfs_extlen_t blockcount,
+ xfs_fsblock_t *new_fsb,
+ xfs_extlen_t *new_len,
+ struct xfs_btree_cur **pcur)
{
- return XFS_ITEM_PINNED;
+ int error;
+
+ error = xfs_refcount_finish_one(tp, type, startblock,
+ blockcount, new_fsb, new_len, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the CUI and frees the CUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ return error;
}
-/*
- * The CUD is either committed or aborted if the transaction is cancelled. If
- * the transaction is cancelled, drop our reference to the CUI and free the
- * CUD.
- */
-STATIC void
-xfs_cud_item_unlock(
- struct xfs_log_item *lip)
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
{
- struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+ struct xfs_mount *mp = priv;
+ struct xfs_refcount_intent *ra;
+ struct xfs_refcount_intent *rb;
+
+ ra = container_of(a, struct xfs_refcount_intent, ri_list);
+ rb = container_of(b, struct xfs_refcount_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_cui_release(cudp->cud_cuip);
- kmem_zone_free(xfs_cud_zone, cudp);
+/* Get an CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_cui_log_item *cuip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ ASSERT(cuip != NULL);
+
+ /*
+ * Get a log_item_desc to point at the new item.
+ */
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ return cuip;
+}
+
+/* Set the phys extent flags for this refcount update. */
+static void
+xfs_trans_set_refcount_flags(
+ struct xfs_phys_extent *refc,
+ enum xfs_refcount_intent_type type)
+{
+ refc->pe_flags = 0;
+ switch (type) {
+ case XFS_REFCOUNT_INCREASE:
+ case XFS_REFCOUNT_DECREASE:
+ case XFS_REFCOUNT_ALLOC_COW:
+ case XFS_REFCOUNT_FREE_COW:
+ refc->pe_flags |= type;
+ break;
+ default:
+ ASSERT(0);
}
}
-/*
- * When the cud item is committed to disk, all we need to do is delete our
- * reference to our partner cui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_cud_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
{
- struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
+ struct xfs_cui_log_item *cuip = intent;
+ struct xfs_refcount_intent *refc;
+ uint next_extent;
+ struct xfs_phys_extent *ext;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
/*
- * Drop the CUI reference regardless of whether the CUD has been
- * aborted. Once the CUD transaction is constructed, it is the sole
- * responsibility of the CUD to release the CUI (even if the CUI is
- * aborted due to log I/O error).
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
*/
- xfs_cui_release(cudp->cud_cuip);
- kmem_zone_free(xfs_cud_zone, cudp);
+ next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+ ASSERT(next_extent < cuip->cui_format.cui_nextents);
+ ext = &cuip->cui_format.cui_extents[next_extent];
+ ext->pe_startblock = refc->ri_startblock;
+ ext->pe_len = refc->ri_blockcount;
+ xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
- return (xfs_lsn_t)-1;
+/* Get an CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_cud(tp, intent);
}
-/*
- * The CUD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_cud_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
{
+ struct xfs_refcount_intent *refc;
+ xfs_fsblock_t new_fsb;
+ xfs_extlen_t new_aglen;
+ int error;
+
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ error = xfs_trans_log_finish_refcount_update(tp, done_item,
+ refc->ri_type,
+ refc->ri_startblock,
+ refc->ri_blockcount,
+ &new_fsb, &new_aglen,
+ (struct xfs_btree_cur **)state);
+ /* Did we run out of reservation? Requeue what we didn't finish. */
+ if (!error && new_aglen > 0) {
+ ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+ refc->ri_type == XFS_REFCOUNT_DECREASE);
+ refc->ri_startblock = new_fsb;
+ refc->ri_blockcount = new_aglen;
+ return -EAGAIN;
+ }
+ kmem_free(refc);
+ return error;
}
-/*
- * This is the ops vector shared by all cud log items.
- */
-static const struct xfs_item_ops xfs_cud_item_ops = {
- .iop_size = xfs_cud_item_size,
- .iop_format = xfs_cud_item_format,
- .iop_pin = xfs_cud_item_pin,
- .iop_unpin = xfs_cud_item_unpin,
- .iop_unlock = xfs_cud_item_unlock,
- .iop_committed = xfs_cud_item_committed,
- .iop_push = xfs_cud_item_push,
- .iop_committing = xfs_cud_item_committing,
-};
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+ struct xfs_trans *tp,
+ void *state,
+ int error)
+{
+ struct xfs_btree_cur *rcur = state;
-/*
- * Allocate and initialize an cud item with the given number of extents.
- */
-struct xfs_cud_log_item *
-xfs_cud_init(
- struct xfs_mount *mp,
- struct xfs_cui_log_item *cuip)
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+ void *intent)
{
- struct xfs_cud_log_item *cudp;
+ xfs_cui_release(intent);
+}
- cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
- xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops);
- cudp->cud_cuip = cuip;
- cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_refcount_intent *refc;
- return cudp;
+ refc = container_of(item, struct xfs_refcount_intent, ri_list);
+ kmem_free(refc);
}
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_refcount_update_diff_items,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .log_item = xfs_refcount_update_log_item,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_update_finish_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+};
+
/*
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index 3896dcc2368f..e47530f30489 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -78,8 +78,6 @@ extern struct kmem_zone *xfs_cui_zone;
extern struct kmem_zone *xfs_cud_zone;
struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
-struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
- struct xfs_cui_log_item *);
void xfs_cui_item_free(struct xfs_cui_log_item *);
void xfs_cui_release(struct xfs_cui_log_item *);
int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c5b4fa004ca4..c4ec7afd1170 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -11,21 +11,12 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_error.h"
-#include "xfs_dir2.h"
-#include "xfs_dir2_priv.h"
-#include "xfs_ioctl.h"
#include "xfs_trace.h"
-#include "xfs_log.h"
#include "xfs_icache.h"
-#include "xfs_pnfs.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_refcount.h"
@@ -33,11 +24,9 @@
#include "xfs_trans_space.h"
#include "xfs_bit.h"
#include "xfs_alloc.h"
-#include "xfs_quota_defs.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"
#include "xfs_iomap.h"
-#include "xfs_rmap_btree.h"
#include "xfs_sb.h"
#include "xfs_ag_resv.h"
@@ -192,7 +181,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
*shared = false;
return 0;
}
@@ -234,93 +223,59 @@ xfs_reflink_trim_around_shared(
}
}
-/*
- * Trim the passed in imap to the next shared/unshared extent boundary, and
- * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.
- *
- * Note that imap will always contain the block numbers for the existing blocks
- * in the data fork, as the upper layers need them for read-modify-write
- * operations.
- */
-int
-xfs_reflink_reserve_cow(
+bool
+xfs_inode_need_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ bool *shared)
{
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- struct xfs_bmbt_irec got;
- int error = 0;
- bool eof = false;
- struct xfs_iext_cursor icur;
- bool shared;
-
- /*
- * Search the COW fork extent list first. This serves two purposes:
- * first this implement the speculative preallocation using cowextisze,
- * so that we also unshared block adjacent to shared blocks instead
- * of just the shared blocks themselves. Second the lookup in the
- * extent list is generally faster than going out to the shared extent
- * tree.
- */
-
- if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got))
- eof = true;
- if (!eof && got.br_startoff <= imap->br_startoff) {
- trace_xfs_reflink_cow_found(ip, imap);
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ /* We can't update any real extents in always COW mode. */
+ if (xfs_is_always_cow_inode(ip) &&
+ !isnullstartblock(imap->br_startblock)) {
+ *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, &shared);
- if (error)
- return error;
-
- /* Not shared? Just report the (potentially capped) extent. */
- if (!shared)
- return 0;
-
- /*
- * Fork all the shared blocks from our write offset until the end of
- * the extent.
- */
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- return error;
-
- error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
- imap->br_blockcount, 0, &got, &icur, eof);
- if (error == -ENOSPC || error == -EDQUOT)
- trace_xfs_reflink_cow_enospc(ip, imap);
- if (error)
- return error;
-
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- trace_xfs_reflink_cow_alloc(ip, &got);
- return 0;
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-/* Convert part of an unwritten CoW extent to a real one. */
-STATIC int
-xfs_reflink_convert_cow_extent(
- struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- xfs_fileoff_t offset_fsb,
- xfs_filblks_t count_fsb)
+static int
+xfs_reflink_convert_cow_locked(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb)
{
- int nimaps = 1;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_btree_cur *dummy_cur = NULL;
+ int dummy_logflags;
+ int error = 0;
- if (imap->br_state == XFS_EXT_NORM)
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
return 0;
- xfs_trim_extent(imap, offset_fsb, count_fsb);
- trace_xfs_reflink_convert_cow(ip, imap);
- if (imap->br_blockcount == 0)
- return 0;
- return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap,
- &nimaps);
+ do {
+ if (got.br_startoff >= offset_fsb + count_fsb)
+ break;
+ if (got.br_state == XFS_EXT_NORM)
+ continue;
+ if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
+ return -EIO;
+
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ if (!got.br_blockcount)
+ continue;
+
+ got.br_state = XFS_EXT_NORM;
+ error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
+ XFS_COW_FORK, &icur, &dummy_cur, &got,
+ &dummy_logflags);
+ if (error)
+ return error;
+ } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
+
+ return error;
}
/* Convert all of the unwritten CoW extents in a file's range to real ones. */
@@ -334,15 +289,12 @@ xfs_reflink_convert_cow(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_filblks_t count_fsb = end_fsb - offset_fsb;
- struct xfs_bmbt_irec imap;
- int nimaps = 1, error = 0;
+ int error;
ASSERT(count != 0);
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb,
- XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT |
- XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -375,7 +327,7 @@ xfs_find_trim_cow_extent(
if (got.br_startoff > offset_fsb) {
xfs_trim_extent(imap, imap->br_startoff,
got.br_startoff - imap->br_startoff);
- return xfs_reflink_trim_around_shared(ip, imap, shared);
+ return xfs_inode_need_cow(ip, imap, shared);
}
*shared = true;
@@ -397,7 +349,8 @@ xfs_reflink_allocate_cow(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
bool *shared,
- uint *lockmode)
+ uint *lockmode,
+ bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
@@ -409,7 +362,10 @@ xfs_reflink_allocate_cow(
xfs_extlen_t resblks = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_is_reflink_inode(ip));
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
if (error || !*shared)
@@ -471,7 +427,16 @@ xfs_reflink_allocate_cow(
if (nimaps == 0)
return -ENOSPC;
convert:
- return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
+ xfs_trim_extent(imap, offset_fsb, count_fsb);
+ /*
+ * COW fork extents are supposed to remain unwritten until we're ready
+ * to initiate a disk write. For direct I/O we are going to write the
+ * data and need the conversion, but for buffered writes we're done.
+ */
+ if (!convert_now || imap->br_state == XFS_EXT_NORM)
+ return 0;
+ trace_xfs_reflink_convert_cow(ip, imap);
+ return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
@@ -586,7 +551,7 @@ xfs_reflink_cancel_cow_range(
int error;
trace_xfs_reflink_cancel_cow_range(ip, offset, count);
- ASSERT(xfs_is_reflink_inode(ip));
+ ASSERT(ip->i_cowfp);
offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
if (count == NULLFILEOFF)
@@ -596,7 +561,7 @@ xfs_reflink_cancel_cow_range(
/* Start a rolling transaction to remove the mappings */
error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
- 0, 0, XFS_TRANS_NOFS, &tp);
+ 0, 0, 0, &tp);
if (error)
goto out;
@@ -655,7 +620,7 @@ xfs_reflink_end_cow_extent(
resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp);
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
@@ -1192,7 +1157,7 @@ xfs_reflink_remap_blocks(
break;
ASSERT(nimaps == 1);
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+ trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
&imap);
/* Translate imap into the destination file. */
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 6d73daef1f13..28a43b7f581d 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,16 +6,28 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
+{
+ return ip->i_mount->m_always_cow &&
+ xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
+}
+
+static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
+{
+ return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
+}
+
extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
struct xfs_bmbt_irec *irec, bool *shared);
+bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
+ bool *shared);
-extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+ struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
+ bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 127dc9c32a54..77ed557b6127 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -14,7 +14,6 @@
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_rmap_item.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
@@ -94,15 +93,6 @@ xfs_rui_item_format(
}
/*
- * Pinning has no meaning for an rui item, so just return.
- */
-STATIC void
-xfs_rui_item_pin(
- struct xfs_log_item *lip)
-{
-}
-
-/*
* The unpin operation is the last place an RUI is manipulated in the log. It is
* either inserted in the AIL or aborted in the event of a log I/O error. In
* either case, the RUI transaction has been successfully committed to make it
@@ -121,71 +111,22 @@ xfs_rui_item_unpin(
}
/*
- * RUI items have no locking or pushing. However, since RUIs are pulled from
- * the AIL when their corresponding RUDs are committed to disk, their situation
- * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
- * will eventually flush the log. This should help in getting the RUI out of
- * the AIL.
- */
-STATIC uint
-xfs_rui_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
-{
- return XFS_ITEM_PINNED;
-}
-
-/*
* The RUI has been either committed or aborted if the transaction has been
* cancelled. If the transaction was cancelled, an RUD isn't going to be
* constructed and thus we free the RUI here directly.
*/
STATIC void
-xfs_rui_item_unlock(
+xfs_rui_item_release(
struct xfs_log_item *lip)
{
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
- xfs_rui_release(RUI_ITEM(lip));
+ xfs_rui_release(RUI_ITEM(lip));
}
-/*
- * The RUI is logged only once and cannot be moved in the log, so simply return
- * the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_rui_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
- return lsn;
-}
-
-/*
- * The RUI dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
-STATIC void
-xfs_rui_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
-{
-}
-
-/*
- * This is the ops vector shared by all rui log items.
- */
static const struct xfs_item_ops xfs_rui_item_ops = {
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
- .iop_pin = xfs_rui_item_pin,
.iop_unpin = xfs_rui_item_unpin,
- .iop_unlock = xfs_rui_item_unlock,
- .iop_committed = xfs_rui_item_committed,
- .iop_push = xfs_rui_item_push,
- .iop_committing = xfs_rui_item_committing,
+ .iop_release = xfs_rui_item_release,
};
/*
@@ -275,126 +216,271 @@ xfs_rud_item_format(
}
/*
- * Pinning has no meaning for an rud item, so just return.
+ * The RUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the RUI and free the
+ * RUD.
*/
STATIC void
-xfs_rud_item_pin(
+xfs_rud_item_release(
struct xfs_log_item *lip)
{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+ xfs_rui_release(rudp->rud_ruip);
+ kmem_zone_free(xfs_rud_zone, rudp);
}
-/*
- * Since pinning has no meaning for an rud item, unpinning does
- * not either.
- */
-STATIC void
-xfs_rud_item_unpin(
- struct xfs_log_item *lip,
- int remove)
+static const struct xfs_item_ops xfs_rud_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .iop_size = xfs_rud_item_size,
+ .iop_format = xfs_rud_item_format,
+ .iop_release = xfs_rud_item_release,
+};
+
+static struct xfs_rud_log_item *
+xfs_trans_get_rud(
+ struct xfs_trans *tp,
+ struct xfs_rui_log_item *ruip)
{
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+ &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip;
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+ xfs_trans_add_item(tp, &rudp->rud_item);
+ return rudp;
}
-/*
- * There isn't much you can do to push on an rud item. It is simply stuck
- * waiting for the log to be flushed to disk.
- */
-STATIC uint
-xfs_rud_item_push(
- struct xfs_log_item *lip,
- struct list_head *buffer_list)
+/* Set the map extent flags for this reverse mapping. */
+static void
+xfs_trans_set_rmap_flags(
+ struct xfs_map_extent *rmap,
+ enum xfs_rmap_intent_type type,
+ int whichfork,
+ xfs_exntst_t state)
{
- return XFS_ITEM_PINNED;
+ rmap->me_flags = 0;
+ if (state == XFS_EXT_UNWRITTEN)
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
+ if (whichfork == XFS_ATTR_FORK)
+ rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
+ switch (type) {
+ case XFS_RMAP_MAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
+ break;
+ case XFS_RMAP_MAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+ break;
+ case XFS_RMAP_UNMAP:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
+ break;
+ case XFS_RMAP_UNMAP_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_CONVERT:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
+ break;
+ case XFS_RMAP_CONVERT_SHARED:
+ rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_ALLOC:
+ rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
+ break;
+ case XFS_RMAP_FREE:
+ rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
+ break;
+ default:
+ ASSERT(0);
+ }
}
/*
- * The RUD is either committed or aborted if the transaction is cancelled. If
- * the transaction is cancelled, drop our reference to the RUI and free the
- * RUD.
+ * Finish an rmap update and log it to the RUD. Note that the transaction is
+ * marked dirty regardless of whether the rmap update succeeds or fails to
+ * support the RUI/RUD lifecycle rules.
*/
-STATIC void
-xfs_rud_item_unlock(
- struct xfs_log_item *lip)
+static int
+xfs_trans_log_finish_rmap_update(
+ struct xfs_trans *tp,
+ struct xfs_rud_log_item *rudp,
+ enum xfs_rmap_intent_type type,
+ uint64_t owner,
+ int whichfork,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ xfs_exntst_t state,
+ struct xfs_btree_cur **pcur)
{
- struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+ int error;
- if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
- xfs_rui_release(rudp->rud_ruip);
- kmem_zone_free(xfs_rud_zone, rudp);
- }
+ error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
+ startblock, blockcount, state, pcur);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the RUI and frees the RUD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ return error;
}
-/*
- * When the rud item is committed to disk, all we need to do is delete our
- * reference to our partner rui item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from
- * further referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_rud_item_committed(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+/* Sort rmap intents by AG. */
+static int
+xfs_rmap_update_diff_items(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
{
- struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+ struct xfs_mount *mp = priv;
+ struct xfs_rmap_intent *ra;
+ struct xfs_rmap_intent *rb;
+
+ ra = container_of(a, struct xfs_rmap_intent, ri_list);
+ rb = container_of(b, struct xfs_rmap_intent, ri_list);
+ return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
+ XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+}
+
+/* Get an RUI. */
+STATIC void *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ unsigned int count)
+{
+ struct xfs_rui_log_item *ruip;
+
+ ASSERT(tp != NULL);
+ ASSERT(count > 0);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ ASSERT(ruip != NULL);
/*
- * Drop the RUI reference regardless of whether the RUD has been
- * aborted. Once the RUD transaction is constructed, it is the sole
- * responsibility of the RUD to release the RUI (even if the RUI is
- * aborted due to log I/O error).
+ * Get a log_item_desc to point at the new item.
*/
- xfs_rui_release(rudp->rud_ruip);
- kmem_zone_free(xfs_rud_zone, rudp);
-
- return (xfs_lsn_t)-1;
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ return ruip;
}
-/*
- * The RUD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from. The dependency
- * tracking has to be handled by the "enclosing" metadata object. For
- * example, for inodes, the inode is locked throughout the extent freeing
- * so the dependency should be recorded there.
- */
+/* Log rmap updates in the intent item. */
STATIC void
-xfs_rud_item_committing(
- struct xfs_log_item *lip,
- xfs_lsn_t lsn)
+xfs_rmap_update_log_item(
+ struct xfs_trans *tp,
+ void *intent,
+ struct list_head *item)
{
+ struct xfs_rui_log_item *ruip = intent;
+ struct xfs_rmap_intent *rmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
+ /*
+ * atomic_inc_return gives us the value after the increment;
+ * we want to use it as an array index so we need to subtract 1 from
+ * it.
+ */
+ next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
+ ASSERT(next_extent < ruip->rui_format.rui_nextents);
+ map = &ruip->rui_format.rui_extents[next_extent];
+ map->me_owner = rmap->ri_owner;
+ map->me_startblock = rmap->ri_bmap.br_startblock;
+ map->me_startoff = rmap->ri_bmap.br_startoff;
+ map->me_len = rmap->ri_bmap.br_blockcount;
+ xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
+ rmap->ri_bmap.br_state);
}
-/*
- * This is the ops vector shared by all rud log items.
- */
-static const struct xfs_item_ops xfs_rud_item_ops = {
- .iop_size = xfs_rud_item_size,
- .iop_format = xfs_rud_item_format,
- .iop_pin = xfs_rud_item_pin,
- .iop_unpin = xfs_rud_item_unpin,
- .iop_unlock = xfs_rud_item_unlock,
- .iop_committed = xfs_rud_item_committed,
- .iop_push = xfs_rud_item_push,
- .iop_committing = xfs_rud_item_committing,
-};
+/* Get an RUD so we can process all the deferred rmap updates. */
+STATIC void *
+xfs_rmap_update_create_done(
+ struct xfs_trans *tp,
+ void *intent,
+ unsigned int count)
+{
+ return xfs_trans_get_rud(tp, intent);
+}
-/*
- * Allocate and initialize an rud item with the given number of extents.
- */
-struct xfs_rud_log_item *
-xfs_rud_init(
- struct xfs_mount *mp,
- struct xfs_rui_log_item *ruip)
+/* Process a deferred rmap update. */
+STATIC int
+xfs_rmap_update_finish_item(
+ struct xfs_trans *tp,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_rmap_intent *rmap;
+ int error;
+
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ error = xfs_trans_log_finish_rmap_update(tp, done_item,
+ rmap->ri_type,
+ rmap->ri_owner, rmap->ri_whichfork,
+ rmap->ri_bmap.br_startoff,
+ rmap->ri_bmap.br_startblock,
+ rmap->ri_bmap.br_blockcount,
+ rmap->ri_bmap.br_state,
+ (struct xfs_btree_cur **)state);
+ kmem_free(rmap);
+ return error;
+}
+
+/* Clean up after processing deferred rmaps. */
+STATIC void
+xfs_rmap_update_finish_cleanup(
+ struct xfs_trans *tp,
+ void *state,
+ int error)
+{
+ struct xfs_btree_cur *rcur = state;
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+}
+/* Abort all pending RUIs. */
+STATIC void
+xfs_rmap_update_abort_intent(
+ void *intent)
{
- struct xfs_rud_log_item *rudp;
+ xfs_rui_release(intent);
+}
- rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP);
- xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops);
- rudp->rud_ruip = ruip;
- rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_rmap_update_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_rmap_intent *rmap;
- return rudp;
+ rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+ kmem_free(rmap);
}
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_rmap_update_diff_items,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .log_item = xfs_rmap_update_log_item,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_update_finish_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+};
+
/*
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 7e482baa27f5..8708e4a5aa5c 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -78,8 +78,6 @@ extern struct kmem_zone *xfs_rui_zone;
extern struct kmem_zone *xfs_rud_zone;
struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint);
-struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *,
- struct xfs_rui_log_item *);
int xfs_rui_copy_format(struct xfs_log_iovec *buf,
struct xfs_rui_log_format *dst_rui_fmt);
void xfs_rui_item_free(struct xfs_rui_log_item *);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ac0fcdad0c4e..5fa4db3c3e32 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -11,17 +11,11 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
-#include "xfs_trace.h"
-#include "xfs_buf.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index cc509743facd..113883c4f202 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -4,7 +4,6 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/proc_fs.h>
struct xstats xfsstats;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c9097cb0b955..f9450235533c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -11,18 +11,15 @@
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
-#include "xfs_error.h"
#include "xfs_fsops.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
@@ -38,18 +35,8 @@
#include "xfs_refcount_item.h"
#include "xfs_bmap_item.h"
#include "xfs_reflink.h"
-#include "xfs_defer.h"
-#include <linux/namei.h>
-#include <linux/dax.h>
-#include <linux/init.h>
-#include <linux/slab.h>
#include <linux/magic.h>
-#include <linux/mount.h>
-#include <linux/mempool.h>
-#include <linux/writeback.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
@@ -66,7 +53,7 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
enum {
Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
- Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
+ Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
@@ -87,7 +74,6 @@ static const match_table_t tokens = {
{Opt_sunit, "sunit=%u"}, /* data volume stripe unit */
{Opt_swidth, "swidth=%u"}, /* data volume stripe width */
{Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */
- {Opt_mtpt, "mtpt"}, /* filesystem mount point */
{Opt_grpid, "grpid"}, /* group-ID from parent directory */
{Opt_nogrpid, "nogrpid"}, /* group-ID from current process */
{Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */
@@ -236,9 +222,6 @@ xfs_parseargs(
if (!mp->m_logname)
return -ENOMEM;
break;
- case Opt_mtpt:
- xfs_warn(mp, "%s option not allowed on this system", p);
- return -EINVAL;
case Opt_rtdev:
kfree(mp->m_rtname);
mp->m_rtname = match_strdup(args);
@@ -448,7 +431,7 @@ struct proc_xfs_info {
char *str;
};
-STATIC int
+STATIC void
xfs_showargs(
struct xfs_mount *mp,
struct seq_file *m)
@@ -527,9 +510,8 @@ xfs_showargs(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
-
- return 0;
}
+
static uint64_t
xfs_max_file_offset(
unsigned int blockshift)
@@ -539,26 +521,18 @@ xfs_max_file_offset(
/* Figure out maximum filesize, on Linux this can depend on
* the filesystem blocksize (on 32 bit platforms).
- * __block_write_begin does this in an [unsigned] long...
+ * __block_write_begin does this in an [unsigned] long long...
* page->index << (PAGE_SHIFT - bbits)
* So, for page sized blocks (4K on 32 bit platforms),
* this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
* (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
* but for smaller blocksizes it is less (bbits = log2 bsize).
- * Note1: get_block_t takes a long (implicit cast from above)
- * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
- * can optionally convert the [unsigned] long from above into
- * an [unsigned] long long.
*/
#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBDAF)
ASSERT(sizeof(sector_t) == 8);
pagefactor = PAGE_SIZE;
bitshift = BITS_PER_LONG;
-# else
- pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
-# endif
#endif
return (((uint64_t)pagefactor) << bitshift) - 1;
@@ -595,7 +569,7 @@ xfs_set_inode_alloc(
* Calculate how much should be reserved for inodes to meet
* the max inode percentage. Used only for inode32.
*/
- if (mp->m_maxicount) {
+ if (M_IGEO(mp)->maxicount) {
uint64_t icount;
icount = sbp->sb_dblocks * sbp->sb_imax_pct;
@@ -838,15 +812,10 @@ xfs_init_mount_workqueues(
if (!mp->m_buf_workqueue)
goto out;
- mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
- if (!mp->m_data_workqueue)
- goto out_destroy_buf;
-
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_unwritten_workqueue)
- goto out_destroy_data_iodone_queue;
+ goto out_destroy_buf;
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
@@ -858,16 +827,10 @@ xfs_init_mount_workqueues(
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
- mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
- mp->m_fsname);
- if (!mp->m_log_workqueue)
- goto out_destroy_reclaim;
-
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
- goto out_destroy_log;
+ goto out_destroy_reclaim;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
mp->m_fsname);
@@ -878,16 +841,12 @@ xfs_init_mount_workqueues(
out_destroy_eofb:
destroy_workqueue(mp->m_eofblocks_workqueue);
-out_destroy_log:
- destroy_workqueue(mp->m_log_workqueue);
out_destroy_reclaim:
destroy_workqueue(mp->m_reclaim_workqueue);
out_destroy_cil:
destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
-out_destroy_data_iodone_queue:
- destroy_workqueue(mp->m_data_workqueue);
out_destroy_buf:
destroy_workqueue(mp->m_buf_workqueue);
out:
@@ -900,10 +859,8 @@ xfs_destroy_mount_workqueues(
{
destroy_workqueue(mp->m_sync_workqueue);
destroy_workqueue(mp->m_eofblocks_workqueue);
- destroy_workqueue(mp->m_log_workqueue);
destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
- destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
destroy_workqueue(mp->m_buf_workqueue);
}
@@ -1152,10 +1109,10 @@ xfs_fs_statfs(
fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree);
statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER);
- if (mp->m_maxicount)
+ if (M_IGEO(mp)->maxicount)
statp->f_files = min_t(typeof(statp->f_files),
statp->f_files,
- mp->m_maxicount);
+ M_IGEO(mp)->maxicount);
/* If sb_icount overshot maxicount, report actual allocation */
statp->f_files = max_t(typeof(statp->f_files),
@@ -1376,7 +1333,7 @@ xfs_fs_remount(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_icache_enable_reclaim(mp);
+ xfs_start_block_reaping(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1390,7 +1347,7 @@ xfs_fs_remount(
* Cancel background eofb scanning so it cannot race with the
* final log force+buftarg wait and deadlock the remount.
*/
- xfs_icache_disable_reclaim(mp);
+ xfs_stop_block_reaping(mp);
/* Get rid of any leftover CoW reservations... */
error = xfs_icache_free_cowblocks(mp, NULL);
@@ -1434,7 +1391,7 @@ xfs_fs_freeze(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_icache_disable_reclaim(mp);
+ xfs_stop_block_reaping(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
return xfs_sync_sb(mp, true);
@@ -1448,7 +1405,7 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_icache_enable_reclaim(mp);
+ xfs_start_block_reaping(mp);
return 0;
}
@@ -1457,7 +1414,8 @@ xfs_fs_show_options(
struct seq_file *m,
struct dentry *root)
{
- return xfs_showargs(XFS_M(root->d_sb), m);
+ xfs_showargs(XFS_M(root->d_sb), m);
+ return 0;
}
/*
@@ -1546,8 +1504,14 @@ xfs_init_percpu_counters(
if (error)
goto free_ifree;
+ error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
+ if (error)
+ goto free_fdblocks;
+
return 0;
+free_fdblocks:
+ percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1571,6 +1535,9 @@ xfs_destroy_percpu_counters(
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
percpu_counter_destroy(&mp->m_fdblocks);
+ ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+ percpu_counter_sum(&mp->m_delalloc_blks) == 0);
+ percpu_counter_destroy(&mp->m_delalloc_blks);
}
static struct xfs_mount *
@@ -1594,6 +1561,13 @@ xfs_mount_alloc(
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
mp->m_kobj.kobject.kset = xfs_kset;
+ /*
+ * We don't create the finobt per-ag space reservation until after log
+ * recovery, so we must set this to true so that an ifree transaction
+ * started during log recovery will not depend on space reservations
+ * for finobt expansion.
+ */
+ mp->m_finobt_nores = true;
return mp;
}
@@ -1689,6 +1663,8 @@ xfs_fs_fill_super(
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
+ sb->s_iflags |= SB_I_CGROUPWB;
+
set_posix_acl_flag(sb);
/* version 5 superblocks support inode version counters. */
@@ -1729,11 +1705,18 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
- xfs_alert(mp,
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
"reflink not compatible with realtime device!");
- error = -EINVAL;
- goto out_filestream_unmount;
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
+ if (xfs_globals.always_cow) {
+ xfs_info(mp, "using DEBUG-only always_cow mode.");
+ mp->m_always_cow = true;
+ }
}
if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 21cb49a43d7c..763e43d22dee 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -38,6 +38,18 @@ extern void xfs_qm_exit(void);
# define XFS_SCRUB_STRING
#endif
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+# define XFS_REPAIR_STRING "repair, "
+#else
+# define XFS_REPAIR_STRING
+#endif
+
+#ifdef CONFIG_XFS_WARN
+# define XFS_WARN_STRING "verbose warnings, "
+#else
+# define XFS_WARN_STRING
+#endif
+
#ifdef DEBUG
# define XFS_DBG_STRING "debug"
#else
@@ -49,6 +61,8 @@ extern void xfs_qm_exit(void);
XFS_SECURITY_STRING \
XFS_REALTIME_STRING \
XFS_SCRUB_STRING \
+ XFS_REPAIR_STRING \
+ XFS_WARN_STRING \
XFS_DBG_STRING /* DBG must be last */
struct xfs_inode;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index b2c1177c717f..ed66fd2de327 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -12,23 +12,14 @@
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_defer.h"
#include "xfs_dir2.h"
#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
-#include "xfs_bmap_util.h"
-#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
-#include "xfs_symlink.h"
#include "xfs_trans.h"
-#include "xfs_log.h"
/* ----- Kernel only functions below ----- */
int
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 0cc034dfb786..31b3bdbd2eba 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -4,10 +4,7 @@
* All Rights Reserved.
*/
#include "xfs.h"
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
#include "xfs_error.h"
-#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 168488130a19..8abf4640f1d5 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -82,9 +82,13 @@ enum {
extern xfs_param_t xfs_params;
struct xfs_globals {
+#ifdef DEBUG
+ int pwork_threads; /* parallel workqueue threads */
+#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
+ bool always_cow; /* use COW fork for all overwrites */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index cd6a994a7250..ddd0bf7a4740 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -10,9 +10,7 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sysfs.h"
-#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_stats.h"
#include "xfs_mount.h"
struct xfs_sysfs_attr {
@@ -183,10 +181,74 @@ mount_delay_show(
}
XFS_SYSFS_ATTR_RW(mount_delay);
+static ssize_t
+always_cow_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &xfs_globals.always_cow);
+ if (ret < 0)
+ return ret;
+ return count;
+}
+
+static ssize_t
+always_cow_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
+}
+XFS_SYSFS_ATTR_RW(always_cow);
+
+#ifdef DEBUG
+/*
+ * Override how many threads the parallel work queue is allowed to create.
+ * This has to be a debug-only global (instead of an errortag) because one of
+ * the main users of parallel workqueues is mount time quotacheck.
+ */
+STATIC ssize_t
+pwork_threads_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < -1 || val > num_possible_cpus())
+ return -EINVAL;
+
+ xfs_globals.pwork_threads = val;
+
+ return count;
+}
+
+STATIC ssize_t
+pwork_threads_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads);
+}
+XFS_SYSFS_ATTR_RW(pwork_threads);
+#endif /* DEBUG */
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
ATTR_LIST(mount_delay),
+ ATTR_LIST(always_cow),
+#ifdef DEBUG
+ ATTR_LIST(pwork_threads),
+#endif
NULL,
};
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index cb6489c22cad..bc85b89f88ca 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -15,24 +15,16 @@
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_da_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
#include "xfs_trans.h"
-#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
-#include "xfs_iomap.h"
-#include "xfs_aops.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_log_recover.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap_btree.h"
#include "xfs_filestream.h"
#include "xfs_fsmap.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6fcc893dfc91..8094b1920eef 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -475,7 +475,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
@@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \
DEFINE_READPAGE_EVENT(xfs_vm_readpage);
DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-TRACE_DEFINE_ENUM(XFS_IO_HOLE);
-TRACE_DEFINE_ENUM(XFS_IO_DELALLOC);
-TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN);
-TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE);
-TRACE_DEFINE_ENUM(XFS_IO_COW);
-
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
- int type, struct xfs_bmbt_irec *irec),
- TP_ARGS(ip, offset, count, type, irec),
+ int whichfork, struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, offset, count, whichfork, irec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(loff_t, size)
__field(loff_t, offset)
__field(size_t, count)
- __field(int, type)
+ __field(int, whichfork)
__field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount)
@@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
__entry->size = ip->i_d.di_size;
__entry->offset = offset;
__entry->count = count;
- __entry->type = type;
+ __entry->whichfork = whichfork;
__entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0;
),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
- "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
+ "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->size,
__entry->offset,
__entry->count,
- __print_symbolic(__entry->type, XFS_IO_TYPES),
+ __entry->whichfork == XFS_COW_FORK ? "cow" : "data",
__entry->startoff,
(int64_t)__entry->startblock,
__entry->blockcount)
)
-#define DEFINE_IOMAP_EVENT(name) \
+#define DEFINE_IMAP_EVENT(name) \
DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
- int type, struct xfs_bmbt_irec *irec), \
- TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
-DEFINE_IOMAP_EVENT(xfs_iomap_found);
+ int whichfork, struct xfs_bmbt_irec *irec), \
+ TP_ARGS(ip, offset, count, whichfork, irec))
+DEFINE_IMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_alloc);
+DEFINE_IMAP_EVENT(xfs_iomap_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
TRACE_EVENT(xfs_reflink_remap_blocks_loop,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
@@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
-DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -3369,8 +3360,221 @@ DEFINE_TRANS_EVENT(xfs_trans_dup);
DEFINE_TRANS_EVENT(xfs_trans_free);
DEFINE_TRANS_EVENT(xfs_trans_roll);
DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_commit_items);
DEFINE_TRANS_EVENT(xfs_trans_free_items);
+TRACE_EVENT(xfs_iunlink_update_bucket,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, bucket, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, bucket)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bucket = bucket;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bucket,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+TRACE_EVENT(xfs_iunlink_update_dinode,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino,
+ xfs_agino_t old_ptr, xfs_agino_t new_ptr),
+ TP_ARGS(mp, agno, agino, old_ptr, new_ptr),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, old_ptr)
+ __field(xfs_agino_t, new_ptr)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agino = agino;
+ __entry->old_ptr = old_ptr;
+ __entry->new_ptr = new_ptr;
+ ),
+ TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->old_ptr,
+ __entry->new_ptr)
+);
+
+DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno %u agino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->agino)
+)
+
+#define DEFINE_AGINODE_EVENT(name) \
+DEFINE_EVENT(xfs_ag_inode_class, name, \
+ TP_PROTO(struct xfs_inode *ip), \
+ TP_ARGS(ip))
+DEFINE_AGINODE_EVENT(xfs_iunlink);
+DEFINE_AGINODE_EVENT(xfs_iunlink_remove);
+DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback);
+
+DECLARE_EVENT_CLASS(xfs_fs_corrupt_class,
+ TP_PROTO(struct xfs_mount *mp, unsigned int flags),
+ TP_ARGS(mp, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->flags)
+);
+#define DEFINE_FS_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_fs_corrupt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
+ TP_ARGS(mp, flags))
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_ag_corrupt_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags),
+ TP_ARGS(mp, agno, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno %u flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno, __entry->flags)
+);
+#define DEFINE_AG_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_ag_corrupt_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ unsigned int flags), \
+ TP_ARGS(mp, agno, flags))
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
+
+DECLARE_EVENT_CLASS(xfs_inode_corrupt_class,
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags),
+ TP_ARGS(ip, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino, __entry->flags)
+);
+#define DEFINE_INODE_CORRUPT_EVENT(name) \
+DEFINE_EVENT(xfs_inode_corrupt_class, name, \
+ TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
+ TP_ARGS(ip, flags))
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+
+TRACE_EVENT(xfs_iwalk_ag,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino),
+ TP_ARGS(mp, agno, startino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->startino)
+)
+
+TRACE_EVENT(xfs_iwalk_ag_rec,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_inobt_rec_incore *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = irec->ir_startino;
+ __entry->freemask = irec->ir_free;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno,
+ __entry->startino, __entry->freemask)
+)
+
+TRACE_EVENT(xfs_pwork_init,
+ TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid),
+ TP_ARGS(mp, nr_threads, pid),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, nr_threads)
+ __field(pid_t, pid)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->nr_threads = nr_threads;
+ __entry->pid = pid;
+ ),
+ TP_printk("dev %d:%d nr_threads %u pid %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->nr_threads, __entry->pid)
+)
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 912b42f5fe4a..d42a68d8313b 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -11,7 +11,6 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_inode.h"
#include "xfs_extent_busy.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
@@ -264,9 +263,7 @@ xfs_trans_alloc(
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
- tp = kmem_zone_zalloc(xfs_trans_zone,
- (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
-
+ tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
@@ -452,7 +449,7 @@ xfs_trans_apply_sb_deltas(
xfs_buf_t *bp;
int whole = 0;
- bp = xfs_trans_getsb(tp, tp->t_mountp, 0);
+ bp = xfs_trans_getsb(tp, tp->t_mountp);
sbp = XFS_BUF_TO_SBP(bp);
/*
@@ -767,10 +764,9 @@ xfs_trans_del_item(
}
/* Detach and unlock all of the items in a transaction */
-void
+static void
xfs_trans_free_items(
struct xfs_trans *tp,
- xfs_lsn_t commit_lsn,
bool abort)
{
struct xfs_log_item *lip, *next;
@@ -779,11 +775,10 @@ xfs_trans_free_items(
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
- if (commit_lsn != NULLCOMMITLSN)
- lip->li_ops->iop_committing(lip, commit_lsn);
if (abort)
set_bit(XFS_LI_ABORTED, &lip->li_flags);
- lip->li_ops->iop_unlock(lip);
+ if (lip->li_ops->iop_release)
+ lip->li_ops->iop_release(lip);
}
}
@@ -804,7 +799,8 @@ xfs_log_item_batch_insert(
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
- lip->li_ops->iop_unpin(lip, 0);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
}
}
@@ -815,7 +811,7 @@ xfs_log_item_batch_insert(
*
* If we are called with the aborted flag set, it is because a log write during
* a CIL checkpoint commit has failed. In this case, all the items in the
- * checkpoint have already gone through iop_commited and iop_unlock, which
+ * checkpoint have already gone through iop_committed and iop_committing, which
* means that checkpoint commit abort handling is treated exactly the same
* as an iclog write error even though we haven't started any IO yet. Hence in
* this case all we need to do is iop_committed processing, followed by an
@@ -833,7 +829,7 @@ xfs_trans_committed_bulk(
struct xfs_ail *ailp,
struct xfs_log_vec *log_vector,
xfs_lsn_t commit_lsn,
- int aborted)
+ bool aborted)
{
#define LOG_ITEM_BATCH_SIZE 32
struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
@@ -852,7 +848,16 @@ xfs_trans_committed_bulk(
if (aborted)
set_bit(XFS_LI_ABORTED, &lip->li_flags);
- item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+
+ if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
+ lip->li_ops->iop_release(lip);
+ continue;
+ }
+
+ if (lip->li_ops->iop_committed)
+ item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
+ else
+ item_lsn = commit_lsn;
/* item_lsn of -1 means the item needs no further processing */
if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
@@ -864,7 +869,8 @@ xfs_trans_committed_bulk(
*/
if (aborted) {
ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount));
- lip->li_ops->iop_unpin(lip, 1);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 1);
continue;
}
@@ -882,7 +888,8 @@ xfs_trans_committed_bulk(
xfs_trans_ail_update(ailp, lip, item_lsn);
else
spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_unpin(lip, 0);
+ if (lip->li_ops->iop_unpin)
+ lip->li_ops->iop_unpin(lip, 0);
continue;
}
@@ -998,7 +1005,7 @@ out_unreserve:
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
+ xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
XFS_STATS_INC(mp, xs_trans_empty);
@@ -1060,7 +1067,7 @@ xfs_trans_cancel(
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
- xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
+ xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c6e1c5704a8c..64d7f171ebd3 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -27,7 +27,7 @@ struct xfs_cud_log_item;
struct xfs_bui_log_item;
struct xfs_bud_log_item;
-typedef struct xfs_log_item {
+struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
@@ -48,7 +48,7 @@ typedef struct xfs_log_item {
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_lsn_t li_seq; /* CIL commit seq */
-} xfs_log_item_t;
+};
/*
* li_flags use the (set/test/clear)_bit atomic interfaces because updates can
@@ -67,17 +67,24 @@ typedef struct xfs_log_item {
{ (1 << XFS_LI_DIRTY), "DIRTY" }
struct xfs_item_ops {
- void (*iop_size)(xfs_log_item_t *, int *, int *);
- void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
- void (*iop_pin)(xfs_log_item_t *);
- void (*iop_unpin)(xfs_log_item_t *, int remove);
+ unsigned flags;
+ void (*iop_size)(struct xfs_log_item *, int *, int *);
+ void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *);
+ void (*iop_pin)(struct xfs_log_item *);
+ void (*iop_unpin)(struct xfs_log_item *, int remove);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
- void (*iop_unlock)(xfs_log_item_t *);
- xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
- void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
- void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
+ void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
+ void (*iop_release)(struct xfs_log_item *);
+ xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+ void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
};
+/*
+ * Release the log item as soon as committed. This is for items just logging
+ * intents that never need to be written back in place.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
@@ -203,7 +210,7 @@ xfs_trans_read_buf(
flags, bpp, ops);
}
-struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
+struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *);
void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
@@ -223,14 +230,6 @@ void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
-struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *,
- struct xfs_efi_log_item *,
- uint);
-int xfs_trans_free_extent(struct xfs_trans *,
- struct xfs_efd_log_item *, xfs_fsblock_t,
- xfs_extlen_t,
- const struct xfs_owner_info *,
- bool);
int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **);
int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
@@ -245,37 +244,4 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern kmem_zone_t *xfs_trans_zone;
-/* rmap updates */
-enum xfs_rmap_intent_type;
-
-struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip);
-int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type,
- uint64_t owner, int whichfork, xfs_fileoff_t startoff,
- xfs_fsblock_t startblock, xfs_filblks_t blockcount,
- xfs_exntst_t state, struct xfs_btree_cur **pcur);
-
-/* refcount updates */
-enum xfs_refcount_intent_type;
-
-struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip);
-int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
- xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
-
-/* mapping updates */
-enum xfs_bmap_intent_type;
-
-struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
- struct xfs_bui_log_item *buip);
-int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
- struct xfs_bud_log_item *rudp, enum xfs_bmap_intent_type type,
- struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff,
- xfs_fsblock_t startblock, xfs_filblks_t *blockcount,
- xfs_exntst_t state);
-
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d3a4e89bf4a0..6ccfd75d3c24 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -6,6 +6,7 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -74,29 +75,29 @@ xfs_ail_check(
* Return a pointer to the last item in the AIL. If the AIL is empty, then
* return NULL.
*/
-static xfs_log_item_t *
+static struct xfs_log_item *
xfs_ail_max(
struct xfs_ail *ailp)
{
if (list_empty(&ailp->ail_head))
return NULL;
- return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail);
+ return list_entry(ailp->ail_head.prev, struct xfs_log_item, li_ail);
}
/*
* Return a pointer to the item which follows the given item in the AIL. If
* the given item is the last item in the list, then return NULL.
*/
-static xfs_log_item_t *
+static struct xfs_log_item *
xfs_ail_next(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
if (lip->li_ail.next == &ailp->ail_head)
return NULL;
- return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
+ return list_first_entry(&lip->li_ail, struct xfs_log_item, li_ail);
}
/*
@@ -109,10 +110,10 @@ xfs_ail_next(
*/
xfs_lsn_t
xfs_ail_min_lsn(
- struct xfs_ail *ailp)
+ struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- xfs_log_item_t *lip;
+ xfs_lsn_t lsn = 0;
+ struct xfs_log_item *lip;
spin_lock(&ailp->ail_lock);
lip = xfs_ail_min(ailp);
@@ -128,10 +129,10 @@ xfs_ail_min_lsn(
*/
static xfs_lsn_t
xfs_ail_max_lsn(
- struct xfs_ail *ailp)
+ struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- xfs_log_item_t *lip;
+ xfs_lsn_t lsn = 0;
+ struct xfs_log_item *lip;
spin_lock(&ailp->ail_lock);
lip = xfs_ail_max(ailp);
@@ -216,13 +217,13 @@ xfs_trans_ail_cursor_clear(
* ascending traversal. Pass a @lsn of zero to initialise the cursor to the
* first item in the AIL. Returns NULL if the list is empty.
*/
-xfs_log_item_t *
+struct xfs_log_item *
xfs_trans_ail_cursor_first(
struct xfs_ail *ailp,
struct xfs_ail_cursor *cur,
xfs_lsn_t lsn)
{
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
xfs_trans_ail_cursor_init(ailp, cur);
@@ -248,7 +249,7 @@ __xfs_trans_ail_cursor_last(
struct xfs_ail *ailp,
xfs_lsn_t lsn)
{
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) {
if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0)
@@ -327,8 +328,8 @@ xfs_ail_splice(
*/
static void
xfs_ail_delete(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
xfs_ail_check(ailp, lip);
list_del(&lip->li_ail);
@@ -347,6 +348,14 @@ xfsaild_push_item(
if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
+ /*
+ * Consider the item pinned if a push callback is not defined so the
+ * caller will force the log. This should only happen for intent items
+ * as they are unpinned once the associated done item is committed to
+ * the on-disk log.
+ */
+ if (!lip->li_ops->iop_push)
+ return XFS_ITEM_PINNED;
return lip->li_ops->iop_push(lip, &ailp->ail_buf_list);
}
@@ -356,7 +365,7 @@ xfsaild_push(
{
xfs_mount_t *mp = ailp->ail_mount;
struct xfs_ail_cursor cur;
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
xfs_lsn_t lsn;
xfs_lsn_t target;
long tout;
@@ -611,10 +620,10 @@ xfsaild(
*/
void
xfs_ail_push(
- struct xfs_ail *ailp,
- xfs_lsn_t threshold_lsn)
+ struct xfs_ail *ailp,
+ xfs_lsn_t threshold_lsn)
{
- xfs_log_item_t *lip;
+ struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) ||
@@ -699,7 +708,7 @@ xfs_trans_ail_update_bulk(
int nr_items,
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
- xfs_log_item_t *mlip;
+ struct xfs_log_item *mlip;
int mlip_changed = 0;
int i;
LIST_HEAD(tmp);
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
deleted file mode 100644
index 11cff449d055..000000000000
--- a/fs/xfs/xfs_trans_bmap.c
+++ /dev/null
@@ -1,233 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_bmap_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_inode.h"
-#include "xfs_defer.h"
-
-/*
- * This routine is called to allocate a "bmap update done"
- * log item.
- */
-struct xfs_bud_log_item *
-xfs_trans_get_bud(
- struct xfs_trans *tp,
- struct xfs_bui_log_item *buip)
-{
- struct xfs_bud_log_item *budp;
-
- budp = xfs_bud_init(tp->t_mountp, buip);
- xfs_trans_add_item(tp, &budp->bud_item);
- return budp;
-}
-
-/*
- * Finish an bmap update and log it to the BUD. Note that the
- * transaction is marked dirty regardless of whether the bmap update
- * succeeds or fails to support the BUI/BUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_bmap_update(
- struct xfs_trans *tp,
- struct xfs_bud_log_item *budp,
- enum xfs_bmap_intent_type type,
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t *blockcount,
- xfs_exntst_t state)
-{
- int error;
-
- error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff,
- startblock, blockcount, state);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the BUI and frees the BUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
- return error;
-}
-
-/* Sort bmap intents by inode. */
-static int
-xfs_bmap_update_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_bmap_intent *ba;
- struct xfs_bmap_intent *bb;
-
- ba = container_of(a, struct xfs_bmap_intent, bi_list);
- bb = container_of(b, struct xfs_bmap_intent, bi_list);
- return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
-}
-
-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_bui_log_item *buip;
-
- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- ASSERT(tp != NULL);
-
- buip = xfs_bui_init(tp->t_mountp);
- ASSERT(buip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &buip->bui_item);
- return buip;
-}
-
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
- struct xfs_map_extent *bmap,
- enum xfs_bmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- bmap->me_flags = 0;
- switch (type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- bmap->me_flags = type;
- break;
- default:
- ASSERT(0);
- }
- if (state == XFS_EXT_UNWRITTEN)
- bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
-/* Log bmap updates in the intent item. */
-STATIC void
-xfs_bmap_update_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_bui_log_item *buip = intent;
- struct xfs_bmap_intent *bmap;
- uint next_extent;
- struct xfs_map_extent *map;
-
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
- ASSERT(next_extent < buip->bui_format.bui_nextents);
- map = &buip->bui_format.bui_extents[next_extent];
- map->me_owner = bmap->bi_owner->i_ino;
- map->me_startblock = bmap->bi_bmap.br_startblock;
- map->me_startoff = bmap->bi_bmap.br_startoff;
- map->me_len = bmap->bi_bmap.br_blockcount;
- xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
- bmap->bi_bmap.br_state);
-}
-
-/* Get an BUD so we can process all the deferred rmap updates. */
-STATIC void *
-xfs_bmap_update_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_bud(tp, intent);
-}
-
-/* Process a deferred rmap update. */
-STATIC int
-xfs_bmap_update_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_bmap_intent *bmap;
- xfs_filblks_t count;
- int error;
-
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- count = bmap->bi_bmap.br_blockcount;
- error = xfs_trans_log_finish_bmap_update(tp, done_item,
- bmap->bi_type,
- bmap->bi_owner, bmap->bi_whichfork,
- bmap->bi_bmap.br_startoff,
- bmap->bi_bmap.br_startblock,
- &count,
- bmap->bi_bmap.br_state);
- if (!error && count > 0) {
- ASSERT(bmap->bi_type == XFS_BMAP_UNMAP);
- bmap->bi_bmap.br_blockcount = count;
- return -EAGAIN;
- }
- kmem_free(bmap);
- return error;
-}
-
-/* Abort all pending BUIs. */
-STATIC void
-xfs_bmap_update_abort_intent(
- void *intent)
-{
- xfs_bui_release(intent);
-}
-
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_bmap_intent *bmap;
-
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
- kmem_free(bmap);
-}
-
-const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
- .max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_bmap_update_diff_items,
- .create_intent = xfs_bmap_update_create_intent,
- .abort_intent = xfs_bmap_update_abort_intent,
- .log_item = xfs_bmap_update_log_item,
- .create_done = xfs_bmap_update_create_done,
- .finish_item = xfs_bmap_update_finish_item,
- .cancel_item = xfs_bmap_update_cancel_item,
-};
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 629f1479c9d2..b5b3a78ef31c 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -10,11 +10,9 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
-#include "xfs_error.h"
#include "xfs_trace.h"
/*
@@ -174,8 +172,7 @@ xfs_trans_get_buf_map(
xfs_buf_t *
xfs_trans_getsb(
xfs_trans_t *tp,
- struct xfs_mount *mp,
- int flags)
+ struct xfs_mount *mp)
{
xfs_buf_t *bp;
struct xfs_buf_log_item *bip;
@@ -185,7 +182,7 @@ xfs_trans_getsb(
* if tp is NULL.
*/
if (tp == NULL)
- return xfs_getsb(mp, flags);
+ return xfs_getsb(mp);
/*
* If the superblock buffer already has this transaction
@@ -203,7 +200,7 @@ xfs_trans_getsb(
return bp;
}
- bp = xfs_getsb(mp, flags);
+ bp = xfs_getsb(mp);
if (bp == NULL)
return NULL;
@@ -277,7 +274,7 @@ xfs_trans_read_buf_map(
* release this buffer when it kills the tranaction.
*/
ASSERT(bp->b_ops != NULL);
- error = xfs_buf_ensure_ops(bp, ops);
+ error = xfs_buf_reverify(bp, ops);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
@@ -428,7 +425,7 @@ xfs_trans_brelse(
/*
* Mark the buffer as not needing to be unlocked when the buf item's
- * iop_unlock() routine is called. The buffer must already be locked
+ * iop_committing() routine is called. The buffer must already be locked
* and associated with the given transaction.
*/
/* ARGSUSED */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c23257a26c2b..1027c9ca6eb8 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -11,7 +11,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
-#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
@@ -29,7 +28,6 @@ xfs_trans_dqjoin(
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- ASSERT(dqp->q_transp != tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(dqp->q_logitem.qli_dquot == dqp);
@@ -37,15 +35,8 @@ xfs_trans_dqjoin(
* Get a log_item_desc to point at the new item.
*/
xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
-
- /*
- * Initialize d_transp so we can later determine if this dquot is
- * associated with this transaction.
- */
- dqp->q_transp = tp;
}
-
/*
* This is called to mark the dquot as needing
* to be logged when the transaction is committed. The dquot must
@@ -61,7 +52,6 @@ xfs_trans_log_dquot(
xfs_trans_t *tp,
xfs_dquot_t *dqp)
{
- ASSERT(dqp->q_transp == tp);
ASSERT(XFS_DQ_IS_LOCKED(dqp));
tp->t_flags |= XFS_TRANS_DIRTY;
@@ -74,13 +64,13 @@ xfs_trans_log_dquot(
*/
void
xfs_trans_dup_dqinfo(
- xfs_trans_t *otp,
- xfs_trans_t *ntp)
+ struct xfs_trans *otp,
+ struct xfs_trans *ntp)
{
- xfs_dqtrx_t *oq, *nq;
- int i, j;
- xfs_dqtrx_t *oqa, *nqa;
- ulong blk_res_used;
+ struct xfs_dqtrx *oq, *nq;
+ int i, j;
+ struct xfs_dqtrx *oqa, *nqa;
+ uint64_t blk_res_used;
if (!otp->t_dqinfo)
return;
@@ -137,7 +127,7 @@ xfs_trans_mod_dquot_byino(
xfs_trans_t *tp,
xfs_inode_t *ip,
uint field,
- long delta)
+ int64_t delta)
{
xfs_mount_t *mp = tp->t_mountp;
@@ -191,12 +181,12 @@ xfs_trans_get_dqtrx(
*/
void
xfs_trans_mod_dquot(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp,
- uint field,
- long delta)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp,
+ uint field,
+ int64_t delta)
{
- xfs_dqtrx_t *qtrx;
+ struct xfs_dqtrx *qtrx;
ASSERT(tp);
ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
@@ -219,14 +209,14 @@ xfs_trans_mod_dquot(
* regular disk blk reservation
*/
case XFS_TRANS_DQ_RES_BLKS:
- qtrx->qt_blk_res += (ulong)delta;
+ qtrx->qt_blk_res += delta;
break;
/*
* inode reservation
*/
case XFS_TRANS_DQ_RES_INOS:
- qtrx->qt_ino_res += (ulong)delta;
+ qtrx->qt_ino_res += delta;
break;
/*
@@ -245,7 +235,7 @@ xfs_trans_mod_dquot(
*/
case XFS_TRANS_DQ_ICOUNT:
if (qtrx->qt_ino_res && delta > 0) {
- qtrx->qt_ino_res_used += (ulong)delta;
+ qtrx->qt_ino_res_used += delta;
ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
}
qtrx->qt_icount_delta += delta;
@@ -255,7 +245,7 @@ xfs_trans_mod_dquot(
* rtblk reservation
*/
case XFS_TRANS_DQ_RES_RTBLKS:
- qtrx->qt_rtblk_res += (ulong)delta;
+ qtrx->qt_rtblk_res += delta;
break;
/*
@@ -263,7 +253,7 @@ xfs_trans_mod_dquot(
*/
case XFS_TRANS_DQ_RTBCOUNT:
if (qtrx->qt_rtblk_res && delta > 0) {
- qtrx->qt_rtblk_res_used += (ulong)delta;
+ qtrx->qt_rtblk_res_used += delta;
ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
}
qtrx->qt_rtbcount_delta += delta;
@@ -288,8 +278,8 @@ xfs_trans_mod_dquot(
*/
STATIC void
xfs_trans_dqlockedjoin(
- xfs_trans_t *tp,
- xfs_dqtrx_t *q)
+ struct xfs_trans *tp,
+ struct xfs_dqtrx *q)
{
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
@@ -320,8 +310,8 @@ xfs_trans_apply_dquot_deltas(
struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
struct xfs_disk_dquot *d;
- long totalbdelta;
- long totalrtbdelta;
+ int64_t totalbdelta;
+ int64_t totalrtbdelta;
if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
@@ -347,7 +337,6 @@ xfs_trans_apply_dquot_deltas(
break;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(dqp->q_transp == tp);
/*
* adjust the actual number of blocks used
@@ -413,7 +402,7 @@ xfs_trans_apply_dquot_deltas(
* reservation that a transaction structure knows of.
*/
if (qtrx->qt_blk_res != 0) {
- ulong blk_res_used = 0;
+ uint64_t blk_res_used = 0;
if (qtrx->qt_bcount_delta > 0)
blk_res_used = qtrx->qt_bcount_delta;
@@ -501,7 +490,7 @@ xfs_trans_unreserve_and_mod_dquots(
{
int i, j;
xfs_dquot_t *dqp;
- xfs_dqtrx_t *qtrx, *qa;
+ struct xfs_dqtrx *qtrx, *qa;
bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
@@ -585,7 +574,7 @@ xfs_trans_dqresv(
xfs_trans_t *tp,
xfs_mount_t *mp,
xfs_dquot_t *dqp,
- long nblks,
+ int64_t nblks,
long ninos,
uint flags)
{
@@ -745,7 +734,7 @@ xfs_trans_reserve_quota_bydquots(
struct xfs_dquot *udqp,
struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp,
- long nblks,
+ int64_t nblks,
long ninos,
uint flags)
{
@@ -804,7 +793,7 @@ int
xfs_trans_reserve_quota_nblks(
struct xfs_trans *tp,
struct xfs_inode *ip,
- long nblks,
+ int64_t nblks,
long ninos,
uint flags)
{
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
deleted file mode 100644
index 0710434eb240..000000000000
--- a/fs/xfs/xfs_trans_extfree.c
+++ /dev/null
@@ -1,287 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_bit.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_extfree_item.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-#include "xfs_defer.h"
-
-/*
- * This routine is called to allocate an "extent free done"
- * log item that will hold nextents worth of extents. The
- * caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-struct xfs_efd_log_item *
-xfs_trans_get_efd(struct xfs_trans *tp,
- struct xfs_efi_log_item *efip,
- uint nextents)
-{
- struct xfs_efd_log_item *efdp;
-
- ASSERT(tp != NULL);
- ASSERT(nextents > 0);
-
- efdp = xfs_efd_init(tp->t_mountp, efip, nextents);
- ASSERT(efdp != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efdp->efd_item);
- return efdp;
-}
-
-/*
- * Free an extent and log it to the EFD. Note that the transaction is marked
- * dirty regardless of whether the extent free succeeds or fails to support the
- * EFI/EFD lifecycle rules.
- */
-int
-xfs_trans_free_extent(
- struct xfs_trans *tp,
- struct xfs_efd_log_item *efdp,
- xfs_fsblock_t start_block,
- xfs_extlen_t ext_len,
- const struct xfs_owner_info *oinfo,
- bool skip_discard)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent *extp;
- uint next_extent;
- xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block);
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
- start_block);
- int error;
-
- trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
-
- error = __xfs_free_extent(tp, start_block, ext_len,
- oinfo, XFS_AG_RESV_NONE, skip_discard);
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = start_block;
- extp->ext_len = ext_len;
- efdp->efd_next_extent++;
-
- return error;
-}
-
-/* Sort bmap items by AG. */
-static int
-xfs_extent_free_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_mount *mp = priv;
- struct xfs_extent_free_item *ra;
- struct xfs_extent_free_item *rb;
-
- ra = container_of(a, struct xfs_extent_free_item, xefi_list);
- rb = container_of(b, struct xfs_extent_free_item, xefi_list);
- return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
-}
-
-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_efi_log_item *efip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- efip = xfs_efi_init(tp->t_mountp, count);
- ASSERT(efip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efip->efi_item);
- return efip;
-}
-
-/* Log a free extent to the intent item. */
-STATIC void
-xfs_extent_free_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_efi_log_item *efip = intent;
- struct xfs_extent_free_item *free;
- uint next_extent;
- struct xfs_extent *extp;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
- ASSERT(next_extent < efip->efi_format.efi_nextents);
- extp = &efip->efi_format.efi_extents[next_extent];
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
-}
-
-/* Get an EFD so we can process all the free extents. */
-STATIC void *
-xfs_extent_free_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_efd(tp, intent, count);
-}
-
-/* Process a free extent. */
-STATIC int
-xfs_extent_free_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_extent_free_item *free;
- int error;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- error = xfs_trans_free_extent(tp, done_item,
- free->xefi_startblock,
- free->xefi_blockcount,
- &free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
- return error;
-}
-
-/* Abort all pending EFIs. */
-STATIC void
-xfs_extent_free_abort_intent(
- void *intent)
-{
- xfs_efi_release(intent);
-}
-
-/* Cancel a free extent. */
-STATIC void
-xfs_extent_free_cancel_item(
- struct list_head *item)
-{
- struct xfs_extent_free_item *free;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
-}
-
-const struct xfs_defer_op_type xfs_extent_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_extent_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
-/*
- * AGFL blocks are accounted differently in the reserve pools and are not
- * inserted into the busy extent list.
- */
-STATIC int
-xfs_agfl_free_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_efd_log_item *efdp = done_item;
- struct xfs_extent_free_item *free;
- struct xfs_extent *extp;
- struct xfs_buf *agbp;
- int error;
- xfs_agnumber_t agno;
- xfs_agblock_t agbno;
- uint next_extent;
-
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
- ASSERT(free->xefi_blockcount == 1);
- agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
- agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
-
- trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
-
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
- if (!error)
- error = xfs_free_agfl_block(tp, agno, agbno, agbp,
- &free->xefi_oinfo);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = free->xefi_startblock;
- extp->ext_len = free->xefi_blockcount;
- efdp->efd_next_extent++;
-
- kmem_free(free);
- return error;
-}
-
-
-/* sub-type with special handling for AGFL deferred frees */
-const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_agfl_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 091eae9f4e74..2e073c1c4614 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -16,12 +16,10 @@ struct xfs_log_vec;
void xfs_trans_init(struct xfs_mount *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
-void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
- bool abort);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
- xfs_lsn_t commit_lsn, int aborted);
+ xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
*
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
deleted file mode 100644
index 6c947ff4faf6..000000000000
--- a/fs/xfs/xfs_trans_refcount.c
+++ /dev/null
@@ -1,241 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_refcount_item.h"
-#include "xfs_alloc.h"
-#include "xfs_refcount.h"
-#include "xfs_defer.h"
-
-/*
- * This routine is called to allocate a "refcount update done"
- * log item.
- */
-struct xfs_cud_log_item *
-xfs_trans_get_cud(
- struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip)
-{
- struct xfs_cud_log_item *cudp;
-
- cudp = xfs_cud_init(tp->t_mountp, cuip);
- xfs_trans_add_item(tp, &cudp->cud_item);
- return cudp;
-}
-
-/*
- * Finish an refcount update and log it to the CUD. Note that the
- * transaction is marked dirty regardless of whether the refcount
- * update succeeds or fails to support the CUI/CUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_refcount_update(
- struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- enum xfs_refcount_intent_type type,
- xfs_fsblock_t startblock,
- xfs_extlen_t blockcount,
- xfs_fsblock_t *new_fsb,
- xfs_extlen_t *new_len,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_refcount_finish_one(tp, type, startblock,
- blockcount, new_fsb, new_len, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the CUI and frees the CUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
- return error;
-}
-
-/* Sort refcount intents by AG. */
-static int
-xfs_refcount_update_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_mount *mp = priv;
- struct xfs_refcount_intent *ra;
- struct xfs_refcount_intent *rb;
-
- ra = container_of(a, struct xfs_refcount_intent, ri_list);
- rb = container_of(b, struct xfs_refcount_intent, ri_list);
- return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
-}
-
-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_cui_log_item *cuip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- cuip = xfs_cui_init(tp->t_mountp, count);
- ASSERT(cuip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &cuip->cui_item);
- return cuip;
-}
-
-/* Set the phys extent flags for this reverse mapping. */
-static void
-xfs_trans_set_refcount_flags(
- struct xfs_phys_extent *refc,
- enum xfs_refcount_intent_type type)
-{
- refc->pe_flags = 0;
- switch (type) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- refc->pe_flags |= type;
- break;
- default:
- ASSERT(0);
- }
-}
-
-/* Log refcount updates in the intent item. */
-STATIC void
-xfs_refcount_update_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_cui_log_item *cuip = intent;
- struct xfs_refcount_intent *refc;
- uint next_extent;
- struct xfs_phys_extent *ext;
-
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
- ASSERT(next_extent < cuip->cui_format.cui_nextents);
- ext = &cuip->cui_format.cui_extents[next_extent];
- ext->pe_startblock = refc->ri_startblock;
- ext->pe_len = refc->ri_blockcount;
- xfs_trans_set_refcount_flags(ext, refc->ri_type);
-}
-
-/* Get an CUD so we can process all the deferred refcount updates. */
-STATIC void *
-xfs_refcount_update_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_cud(tp, intent);
-}
-
-/* Process a deferred refcount update. */
-STATIC int
-xfs_refcount_update_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_refcount_intent *refc;
- xfs_fsblock_t new_fsb;
- xfs_extlen_t new_aglen;
- int error;
-
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, done_item,
- refc->ri_type,
- refc->ri_startblock,
- refc->ri_blockcount,
- &new_fsb, &new_aglen,
- (struct xfs_btree_cur **)state);
- /* Did we run out of reservation? Requeue what we didn't finish. */
- if (!error && new_aglen > 0) {
- ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
- refc->ri_type == XFS_REFCOUNT_DECREASE);
- refc->ri_startblock = new_fsb;
- refc->ri_blockcount = new_aglen;
- return -EAGAIN;
- }
- kmem_free(refc);
- return error;
-}
-
-/* Clean up after processing deferred refcounts. */
-STATIC void
-xfs_refcount_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
-}
-
-/* Abort all pending CUIs. */
-STATIC void
-xfs_refcount_update_abort_intent(
- void *intent)
-{
- xfs_cui_release(intent);
-}
-
-/* Cancel a deferred refcount update. */
-STATIC void
-xfs_refcount_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_refcount_intent *refc;
-
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
- kmem_free(refc);
-}
-
-const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
- .max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_refcount_update_diff_items,
- .create_intent = xfs_refcount_update_create_intent,
- .abort_intent = xfs_refcount_update_abort_intent,
- .log_item = xfs_refcount_update_log_item,
- .create_done = xfs_refcount_update_create_done,
- .finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_update_finish_cleanup,
- .cancel_item = xfs_refcount_update_cancel_item,
-};
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
deleted file mode 100644
index a42890931ecd..000000000000
--- a/fs/xfs/xfs_trans_rmap.c
+++ /dev/null
@@ -1,258 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Copyright (C) 2016 Oracle. All Rights Reserved.
- * Author: Darrick J. Wong <darrick.wong@oracle.com>
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_shared.h"
-#include "xfs_format.h"
-#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
-#include "xfs_defer.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_rmap_item.h"
-#include "xfs_alloc.h"
-#include "xfs_rmap.h"
-#include "xfs_defer.h"
-
-/* Set the map extent flags for this reverse mapping. */
-static void
-xfs_trans_set_rmap_flags(
- struct xfs_map_extent *rmap,
- enum xfs_rmap_intent_type type,
- int whichfork,
- xfs_exntst_t state)
-{
- rmap->me_flags = 0;
- if (state == XFS_EXT_UNWRITTEN)
- rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN;
- if (whichfork == XFS_ATTR_FORK)
- rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK;
- switch (type) {
- case XFS_RMAP_MAP:
- rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
- break;
- case XFS_RMAP_MAP_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
- break;
- case XFS_RMAP_UNMAP:
- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
- break;
- case XFS_RMAP_UNMAP_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
- break;
- case XFS_RMAP_CONVERT:
- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
- break;
- case XFS_RMAP_CONVERT_SHARED:
- rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
- break;
- case XFS_RMAP_ALLOC:
- rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
- break;
- case XFS_RMAP_FREE:
- rmap->me_flags |= XFS_RMAP_EXTENT_FREE;
- break;
- default:
- ASSERT(0);
- }
-}
-
-struct xfs_rud_log_item *
-xfs_trans_get_rud(
- struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip)
-{
- struct xfs_rud_log_item *rudp;
-
- rudp = xfs_rud_init(tp->t_mountp, ruip);
- xfs_trans_add_item(tp, &rudp->rud_item);
- return rudp;
-}
-
-/*
- * Finish an rmap update and log it to the RUD. Note that the transaction is
- * marked dirty regardless of whether the rmap update succeeds or fails to
- * support the RUI/RUD lifecycle rules.
- */
-int
-xfs_trans_log_finish_rmap_update(
- struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp,
- enum xfs_rmap_intent_type type,
- uint64_t owner,
- int whichfork,
- xfs_fileoff_t startoff,
- xfs_fsblock_t startblock,
- xfs_filblks_t blockcount,
- xfs_exntst_t state,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff,
- startblock, blockcount, state, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the RUI and frees the RUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
- return error;
-}
-
-/* Sort rmap intents by AG. */
-static int
-xfs_rmap_update_diff_items(
- void *priv,
- struct list_head *a,
- struct list_head *b)
-{
- struct xfs_mount *mp = priv;
- struct xfs_rmap_intent *ra;
- struct xfs_rmap_intent *rb;
-
- ra = container_of(a, struct xfs_rmap_intent, ri_list);
- rb = container_of(b, struct xfs_rmap_intent, ri_list);
- return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) -
- XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
-}
-
-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_rui_log_item *ruip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- ruip = xfs_rui_init(tp->t_mountp, count);
- ASSERT(ruip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &ruip->rui_item);
- return ruip;
-}
-
-/* Log rmap updates in the intent item. */
-STATIC void
-xfs_rmap_update_log_item(
- struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
-{
- struct xfs_rui_log_item *ruip = intent;
- struct xfs_rmap_intent *rmap;
- uint next_extent;
- struct xfs_map_extent *map;
-
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
-
- /*
- * atomic_inc_return gives us the value after the increment;
- * we want to use it as an array index so we need to subtract 1 from
- * it.
- */
- next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1;
- ASSERT(next_extent < ruip->rui_format.rui_nextents);
- map = &ruip->rui_format.rui_extents[next_extent];
- map->me_owner = rmap->ri_owner;
- map->me_startblock = rmap->ri_bmap.br_startblock;
- map->me_startoff = rmap->ri_bmap.br_startoff;
- map->me_len = rmap->ri_bmap.br_blockcount;
- xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork,
- rmap->ri_bmap.br_state);
-}
-
-/* Get an RUD so we can process all the deferred rmap updates. */
-STATIC void *
-xfs_rmap_update_create_done(
- struct xfs_trans *tp,
- void *intent,
- unsigned int count)
-{
- return xfs_trans_get_rud(tp, intent);
-}
-
-/* Process a deferred rmap update. */
-STATIC int
-xfs_rmap_update_finish_item(
- struct xfs_trans *tp,
- struct list_head *item,
- void *done_item,
- void **state)
-{
- struct xfs_rmap_intent *rmap;
- int error;
-
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, done_item,
- rmap->ri_type,
- rmap->ri_owner, rmap->ri_whichfork,
- rmap->ri_bmap.br_startoff,
- rmap->ri_bmap.br_startblock,
- rmap->ri_bmap.br_blockcount,
- rmap->ri_bmap.br_state,
- (struct xfs_btree_cur **)state);
- kmem_free(rmap);
- return error;
-}
-
-/* Clean up after processing deferred rmaps. */
-STATIC void
-xfs_rmap_update_finish_cleanup(
- struct xfs_trans *tp,
- void *state,
- int error)
-{
- struct xfs_btree_cur *rcur = state;
-
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
-}
-
-/* Abort all pending RUIs. */
-STATIC void
-xfs_rmap_update_abort_intent(
- void *intent)
-{
- xfs_rui_release(intent);
-}
-
-/* Cancel a deferred rmap update. */
-STATIC void
-xfs_rmap_update_cancel_item(
- struct list_head *item)
-{
- struct xfs_rmap_intent *rmap;
-
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
- kmem_free(rmap);
-}
-
-const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
- .max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_rmap_update_diff_items,
- .create_intent = xfs_rmap_update_create_intent,
- .abort_intent = xfs_rmap_update_abort_intent,
- .log_item = xfs_rmap_update_log_item,
- .create_done = xfs_rmap_update_create_done,
- .finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_update_finish_cleanup,
- .cancel_item = xfs_rmap_update_cancel_item,
-};
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 63ee1d5bf1d7..3123b5aaad2a 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -5,15 +5,12 @@
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_acl.h"
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
@@ -129,6 +126,9 @@ __xfs_xattr_put_listent(
char *offset;
int arraytop;
+ if (context->count < 0 || context->seen_enough)
+ return;
+
if (!context->alist)
goto compute_size;