Diffstat (limited to 'fs/xfs')
183 files changed, 8758 insertions(+), 6360 deletions(-)
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 457ac9f97377..e685299eb3d2 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,7 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only config XFS_FS tristate "XFS filesystem support" depends on BLOCK - depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C select FS_IOMAP diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7f96bdadc372..06b68b6115bc 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -4,8 +4,8 @@ # All Rights Reserved. # -ccflags-y += -I$(src) # needed for trace events -ccflags-y += -I$(src)/libxfs +ccflags-y += -I $(srctree)/$(src) # needed for trace events +ccflags-y += -I $(srctree)/$(src)/libxfs ccflags-$(CONFIG_XFS_DEBUG) += -g @@ -49,6 +49,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_refcount_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ + xfs_trans_inode.o \ xfs_trans_resv.o \ xfs_types.o \ ) @@ -62,6 +63,7 @@ xfs-y += xfs_aops.o \ xfs_attr_inactive.o \ xfs_attr_list.o \ xfs_bmap_util.o \ + xfs_bio_io.o \ xfs_buf.o \ xfs_dir2_readdir.o \ xfs_discard.o \ @@ -73,15 +75,18 @@ xfs-y += xfs_aops.o \ xfs_fsmap.o \ xfs_fsops.o \ xfs_globals.o \ + xfs_health.o \ xfs_icache.o \ xfs_ioctl.o \ xfs_iomap.o \ xfs_iops.o \ xfs_inode.o \ xfs_itable.o \ + xfs_iwalk.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_pwork.o \ xfs_reflink.o \ xfs_stats.o \ xfs_super.o \ @@ -103,12 +108,7 @@ xfs-y += xfs_log.o \ xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ - xfs_trans_bmap.o \ - xfs_trans_buf.o \ - xfs_trans_extfree.o \ - xfs_trans_inode.o \ - xfs_trans_refcount.o \ - xfs_trans_rmap.o \ + xfs_trans_buf.o # optional features xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ @@ -142,6 +142,8 @@ xfs-y += $(addprefix scrub/, \ common.o \ dabtree.o \ dir.o \ + fscounters.o \ + health.o \ ialloc.o \ inode.o \ parent.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index fdd9d6ede25c..16bb9a328678 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -3,12 +3,7 @@ * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. 
*/ -#include <linux/mm.h> #include <linux/sched/mm.h> -#include <linux/highmem.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/blkdev.h> #include <linux/backing-dev.h> #include "kmem.h" #include "xfs_message.h" diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 8e6b3ba81c03..267655acd426 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -124,4 +124,12 @@ kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) return kmem_zone_alloc(zone, flags | KM_ZERO); } +static inline struct page * +kmem_to_page(void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + return virt_to_page(addr); +} + #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 999ad8d00d43..5de296b34ab1 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -10,6 +10,7 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" @@ -19,6 +20,8 @@ #include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_ag_resv.h" +#include "xfs_health.h" static struct xfs_buf * xfs_get_aghdr_buf( @@ -42,6 +45,12 @@ xfs_get_aghdr_buf( return bp; } +static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) +{ + return mp->m_sb.sb_logstart > 0 && + id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); +} + /* * Generic btree root block init function */ @@ -51,40 +60,85 @@ xfs_btroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0); + xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno); } -/* - * Alloc btree root block init functions - */ +/* Finish initializing a free space btree. */ static void -xfs_bnoroot_init( +xfs_freesp_init_recs( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { struct xfs_alloc_rec *arec; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0); arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + + if (is_log_ag(mp, id)) { + struct xfs_alloc_rec *nrec; + xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, + mp->m_sb.sb_logstart); + + ASSERT(start >= mp->m_ag_prealloc_blocks); + if (start != mp->m_ag_prealloc_blocks) { + /* + * Modify first record to pad stripe align of log + */ + arec->ar_blockcount = cpu_to_be32(start - + mp->m_ag_prealloc_blocks); + nrec = arec + 1; + + /* + * Insert second record at start of internal log + * which then gets trimmed. + */ + nrec->ar_startblock = cpu_to_be32( + be32_to_cpu(arec->ar_startblock) + + be32_to_cpu(arec->ar_blockcount)); + arec = nrec; + be16_add_cpu(&block->bb_numrecs, 1); + } + /* + * Change record start to after the internal log + */ + be32_add_cpu(&arec->ar_startblock, mp->m_sb.sb_logblocks); + } + + /* + * Calculate the record block count and check for the case where + * the log might have consumed all available space in the AG. If + * so, reset the record count to 0 to avoid exposure of an invalid + * record start block. 
+ */ arec->ar_blockcount = cpu_to_be32(id->agsize - be32_to_cpu(arec->ar_startblock)); + if (!arec->ar_blockcount) + block->bb_numrecs = 0; } +/* + * Alloc btree root block init functions + */ static void -xfs_cntroot_init( +xfs_bnoroot_init( struct xfs_mount *mp, struct xfs_buf *bp, struct aghdr_init_data *id) { - struct xfs_alloc_rec *arec; + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno); + xfs_freesp_init_recs(mp, bp, id); +} - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0); - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32(id->agsize - - be32_to_cpu(arec->ar_startblock)); +static void +xfs_cntroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno); + xfs_freesp_init_recs(mp, bp, id); } /* @@ -99,7 +153,7 @@ xfs_rmaproot_init( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0); + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO @@ -147,6 +201,18 @@ xfs_rmaproot_init( rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); } + + /* account for the log space */ + if (is_log_ag(mp, id)) { + rrec = XFS_RMAP_REC_ADDR(block, + be16_to_cpu(block->bb_numrecs) + 1); + rrec->rm_startblock = cpu_to_be32( + XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart)); + rrec->rm_blockcount = cpu_to_be32(mp->m_sb.sb_logblocks); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_LOG); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } } /* @@ -207,6 +273,14 @@ xfs_agfblock_init( agf->agf_refcount_level = cpu_to_be32(1); agf->agf_refcount_blocks = cpu_to_be32(1); } + + if (is_log_ag(mp, id)) { + int64_t logblocks = mp->m_sb.sb_logblocks; + + be32_add_cpu(&agf->agf_freeblks, -logblocks); + agf->agf_longest = cpu_to_be32(id->agsize - + XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart) - logblocks); + } } static void @@ -339,14 +413,14 @@ xfs_ag_init_headers( { /* BNO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_allocbt_buf_ops, + .ops = &xfs_cntbt_buf_ops, .work = &xfs_cntroot_init, .need_init = true }, @@ -361,7 +435,7 @@ xfs_ag_init_headers( { /* FINO root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), - .ops = &xfs_inobt_buf_ops, + .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) @@ -461,3 +535,55 @@ xfs_ag_extend_space( len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); } + +/* Retrieve AG geometry. */ +int +xfs_ag_get_geometry( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_ag_geometry *ageo) +{ + struct xfs_buf *agi_bp; + struct xfs_buf *agf_bp; + struct xfs_agi *agi; + struct xfs_agf *agf; + struct xfs_perag *pag; + unsigned int freeblks; + int error; + + if (agno >= mp->m_sb.sb_agcount) + return -EINVAL; + + /* Lock the AG headers. 
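The record arithmetic in xfs_freesp_init_recs() above is self-contained enough to model in userspace. A minimal sketch, assuming toy geometry values and plain uint32_t in place of the on-disk big-endian fields; init_freesp_recs() is a hypothetical stand-in, not the kernel function:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t start, count; };

/* Toy model of xfs_freesp_init_recs(): build the free space records
 * for a freshly initialized AG that may host the internal log. */
static int init_freesp_recs(uint32_t agsize, uint32_t prealloc,
                            uint32_t logstart, uint32_t logblocks,
                            struct rec out[2])
{
    int n = 0;

    out[n].start = prealloc;
    if (logblocks) {                        /* this AG hosts the log */
        assert(logstart >= prealloc);
        if (logstart != prealloc) {
            /* first record pads out to the stripe-aligned log start */
            out[n].count = logstart - prealloc;
            n++;
            out[n].start = logstart;
        }
        /* surviving record begins past the internal log */
        out[n].start += logblocks;
    }
    out[n].count = agsize - out[n].start;
    n++;
    /* the patch zeroes bb_numrecs when the log consumed the whole AG */
    if (out[n - 1].count == 0)
        n = 0;
    return n;
}

int main(void)
{
    struct rec r[2];
    /* hypothetical AG: 1000 blocks, 16 reserved, log at 64 for 128 */
    int n = init_freesp_recs(1000, 16, 64, 128, r);

    for (int i = 0; i < n; i++)
        printf("rec %d: start %u count %u\n", i, r[i].start, r[i].count);
    return 0;
}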
*/ + error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp); + if (error) + return error; + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); + if (error) + goto out_agi; + pag = xfs_perag_get(mp, agno); + + /* Fill out form. */ + memset(ageo, 0, sizeof(*ageo)); + ageo->ag_number = agno; + + agi = XFS_BUF_TO_AGI(agi_bp); + ageo->ag_icount = be32_to_cpu(agi->agi_count); + ageo->ag_ifree = be32_to_cpu(agi->agi_freecount); + + agf = XFS_BUF_TO_AGF(agf_bp); + ageo->ag_length = be32_to_cpu(agf->agf_length); + freeblks = pag->pagf_freeblks + + pag->pagf_flcount + + pag->pagf_btreeblks - + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE); + ageo->ag_freeblks = freeblks; + xfs_ag_geom_health(pag, ageo); + + /* Release resources. */ + xfs_perag_put(pag); + xfs_buf_relse(agf_bp); +out_agi: + xfs_buf_relse(agi_bp); + return error; +} diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 412702e23f61..5166322807e7 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -26,5 +26,7 @@ struct aghdr_init_data { int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, struct aghdr_init_data *id, xfs_extlen_t len); +int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_ag_geometry *ageo); #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e701ebc36c06..87a9747f1d36 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -9,20 +9,12 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_alloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_trans.h" -#include "xfs_bit.h" -#include "xfs_bmap.h" -#include "xfs_bmap_btree.h" -#include "xfs_ag_resv.h" -#include "xfs_trans_space.h" #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" @@ -281,7 +273,7 @@ xfs_ag_resv_init( */ ask = used = 0; - mp->m_inotbt_nores = true; + mp->m_finobt_nores = true; error = xfs_refcountbt_calc_reserves(mp, tp, agno, &ask, &used); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index b715668886a4..372ad55631fc 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -13,7 +13,6 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_alloc_btree.h" @@ -21,7 +20,6 @@ #include "xfs_extent_busy.h" #include "xfs_errortag.h" #include "xfs_error.h" -#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" @@ -41,8 +39,6 @@ struct workqueue_struct *xfs_alloc_wq; STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); -STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, - xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); /* * Size of the AGFL. 
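The new xfs_ag_get_geometry() reports free space as raw free blocks plus AGFL slots and free-space-btree blocks, minus whatever the per-AG reservations still hold back. A minimal sketch of that arithmetic with made-up per-AG counters:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t pagf_freeblks  = 9000; /* free space tracked in the bno/cnt btrees */
    uint32_t pagf_flcount   = 6;    /* blocks parked on the AGFL */
    uint32_t pagf_btreeblks = 12;   /* blocks consumed by the free space btrees */
    uint32_t resv_needed    = 200;  /* xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE) */

    uint32_t ag_freeblks = pagf_freeblks + pagf_flcount +
                           pagf_btreeblks - resv_needed;

    printf("ag_freeblks = %u\n", ag_freeblks);
    return 0;
}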
For CRC-enabled filesystes we steal a couple of slots in @@ -555,7 +551,7 @@ static xfs_failaddr_t xfs_agfl_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); int i; @@ -568,9 +564,9 @@ xfs_agfl_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return NULL; - if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) + if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) return __this_address; - if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; /* * during growfs operations, the perag is not fully initialised, @@ -596,7 +592,7 @@ static void xfs_agfl_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; /* @@ -621,7 +617,7 @@ static void xfs_agfl_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; xfs_failaddr_t fa; @@ -643,6 +639,7 @@ xfs_agfl_write_verify( const struct xfs_buf_ops xfs_agfl_buf_ops = { .name = "xfs_agfl", + .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, .verify_struct = xfs_agfl_verify, @@ -699,6 +696,107 @@ xfs_alloc_update_counters( */ /* + * Deal with the case where only small freespaces remain. Either return the + * contents of the last freespace record, or allocate space from the freelist if + * there is nothing in the tree. + */ +STATIC int /* error */ +xfs_alloc_ag_vextent_small( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur *ccur, /* optional by-size cursor */ + xfs_agblock_t *fbnop, /* result block number */ + xfs_extlen_t *flenp, /* result length */ + int *stat) /* status: 0-freelist, 1-normal/none */ +{ + int error = 0; + xfs_agblock_t fbno = NULLAGBLOCK; + xfs_extlen_t flen = 0; + int i = 0; + + /* + * If a cntbt cursor is provided, try to allocate the largest record in + * the tree. Try the AGFL if the cntbt is empty, otherwise fail the + * allocation. Make sure to respect minleft even when pulling from the + * freelist. 
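The .magic arrays added to the buf_ops throughout this patch replace open-coded magic-number checks: slot 0 carries the non-CRC magic, slot 1 the CRC (v5) magic, and xfs_verify_magic() compares the on-disk value against the slot matching the superblock version. A standalone sketch of that pattern, with invented struct and helper names and glibc's htobe32(); the ABTB/AB3B constants match the bnobt magics:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <endian.h>   /* htobe32(), glibc */

typedef uint32_t be32;

struct buf_ops {
    const char *name;
    be32 magic[2];    /* [0]: non-CRC magic, [1]: CRC (v5) magic */
};

/* Sketch of the xfs_verify_magic() idea: pick the table slot by
 * filesystem version, compare the big-endian values directly. */
static bool verify_magic(const struct buf_ops *ops, bool hascrc, be32 disk)
{
    return disk == ops->magic[hascrc ? 1 : 0];
}

int main(void)
{
    const struct buf_ops bnobt = {
        .name  = "xfs_bnobt",
        .magic = { htobe32(0x41425442),   /* "ABTB" */
                   htobe32(0x41423342) }, /* "AB3B" */
    };
    be32 disk = htobe32(0x41425442);

    printf("v4 match: %d, v5 match: %d\n",
           verify_magic(&bnobt, false, disk),
           verify_magic(&bnobt, true, disk));
    return 0;
}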
+ */ + if (ccur) + error = xfs_btree_decrement(ccur, 0, &i); + if (error) + goto error; + if (i) { + error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error); + goto out; + } + + if (args->minlen != 1 || args->alignment != 1 || + args->resv == XFS_AG_RESV_AGFL || + (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <= + args->minleft)) + goto out; + + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) + goto error; + if (fbno == NULLAGBLOCK) + goto out; + + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + xfs_alloc_allow_busy_reuse(args->datatype)); + + if (xfs_alloc_is_userdata(args->datatype)) { + struct xfs_buf *bp; + + bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno); + if (!bp) { + error = -EFSCORRUPTED; + goto error; + } + xfs_trans_binval(args->tp, bp); + } + *fbnop = args->agbno = fbno; + *flenp = args->len = 1; + XFS_WANT_CORRUPTED_GOTO(args->mp, + fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), + error); + args->wasfromfl = 1; + trace_xfs_alloc_small_freelist(args); + + /* + * If we're feeding an AGFL block to something that doesn't live in the + * free space, we need to clear out the OWN_AG rmap. + */ + error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, + &XFS_RMAP_OINFO_AG); + if (error) + goto error; + + *stat = 0; + return 0; + +out: + /* + * Can't do the allocation, give up. + */ + if (flen < args->minlen) { + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_small_notenough(args); + flen = 0; + } + *fbnop = fbno; + *flenp = flen; + *stat = 1; + trace_xfs_alloc_small_done(args); + return 0; + +error: + trace_xfs_alloc_small_error(args); + return error; +} + +/* * Allocate a variable extent in the allocation group agno. * Type and bno are used to determine where in the allocation group the * extent will start. @@ -1582,112 +1680,6 @@ out_nominleft: } /* - * Deal with the case where only small freespaces remain. - * Either return the contents of the last freespace record, - * or allocate space from the freelist if there is nothing in the tree. - */ -STATIC int /* error */ -xfs_alloc_ag_vextent_small( - xfs_alloc_arg_t *args, /* allocation argument structure */ - xfs_btree_cur_t *ccur, /* by-size cursor */ - xfs_agblock_t *fbnop, /* result block number */ - xfs_extlen_t *flenp, /* result length */ - int *stat) /* status: 0-freelist, 1-normal/none */ -{ - int error; - xfs_agblock_t fbno; - xfs_extlen_t flen; - int i; - - if ((error = xfs_btree_decrement(ccur, 0, &i))) - goto error0; - if (i) { - if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); - } - /* - * Nothing in the btree, try the freelist. Make sure - * to respect minleft even when pulling from the - * freelist. 
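The hoisted xfs_alloc_ag_vextent_small() collapses the old nested if/else chains into straight-line checks with one exit per outcome. A toy model of the resulting control flow, with invented parameters; *stat keeps the original contract (0 = satisfied from the freelist, 1 = normal record or none):

#include <stdbool.h>
#include <stdio.h>

#define NULLAGBLOCK ((unsigned)-1)

static int vextent_small(bool have_cnt_rec, unsigned rec_bno, unsigned rec_len,
                         unsigned minlen, unsigned alignment,
                         bool resv_is_agfl, unsigned flcount, unsigned minleft,
                         unsigned *bno, unsigned *len, int *stat)
{
    if (have_cnt_rec) {             /* largest record in the cntbt */
        *bno = rec_bno;
        *len = rec_len;
        goto out;
    }
    /* only single unaligned non-AGFL allocations may raid the freelist */
    if (minlen != 1 || alignment != 1 || resv_is_agfl || flcount <= minleft)
        goto out_empty;
    *bno = 42;                      /* pretend xfs_alloc_get_freelist() result */
    *len = 1;
    *stat = 0;
    return 0;
out_empty:
    *bno = NULLAGBLOCK;
    *len = 0;
out:
    if (*len < minlen) {            /* can't do the allocation, give up */
        *bno = NULLAGBLOCK;
        *len = 0;
    }
    *stat = 1;
    return 0;
}

int main(void)
{
    unsigned bno, len;
    int stat;

    vextent_small(false, 0, 0, 1, 1, false, 4, 2, &bno, &len, &stat);
    printf("bno=%u len=%u stat=%d\n", bno, len, stat);  /* freelist grab */
    return 0;
}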
- */ - else if (args->minlen == 1 && args->alignment == 1 && - args->resv != XFS_AG_RESV_AGFL && - (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) - > args->minleft)) { - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); - if (error) - goto error0; - if (fbno != NULLAGBLOCK) { - xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - xfs_alloc_allow_busy_reuse(args->datatype)); - - if (xfs_alloc_is_userdata(args->datatype)) { - xfs_buf_t *bp; - - bp = xfs_btree_get_bufs(args->mp, args->tp, - args->agno, fbno, 0); - if (!bp) { - error = -EFSCORRUPTED; - goto error0; - } - xfs_trans_binval(args->tp, bp); - } - args->len = 1; - args->agbno = fbno; - XFS_WANT_CORRUPTED_GOTO(args->mp, - args->agbno + args->len <= - be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), - error0); - args->wasfromfl = 1; - trace_xfs_alloc_small_freelist(args); - - /* - * If we're feeding an AGFL block to something that - * doesn't live in the free space, we need to clear - * out the OWN_AG rmap. - */ - error = xfs_rmap_free(args->tp, args->agbp, args->agno, - fbno, 1, &XFS_RMAP_OINFO_AG); - if (error) - goto error0; - - *stat = 0; - return 0; - } - /* - * Nothing in the freelist. - */ - else - flen = 0; - } - /* - * Can't allocate from the freelist for some reason. - */ - else { - fbno = NULLAGBLOCK; - flen = 0; - } - /* - * Can't do the allocation, give up. - */ - if (flen < args->minlen) { - args->agbno = NULLAGBLOCK; - trace_xfs_alloc_small_notenough(args); - flen = 0; - } - *fbnop = fbno; - *flenp = flen; - *stat = 1; - trace_xfs_alloc_small_done(args); - return 0; - -error0: - trace_xfs_alloc_small_error(args); - return error; -} - -/* * Free the extent starting at agno/bno for length. */ STATIC int @@ -2041,6 +2033,7 @@ xfs_alloc_space_available( xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; + xfs_extlen_t agflcount; if (flags & XFS_ALLOC_FLAG_FREEING) return true; @@ -2053,8 +2046,13 @@ xfs_alloc_space_available( if (longest < alloc_len) return false; - /* do we have enough free space remaining for the allocation? */ - available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + /* + * Do we have enough free space remaining for the allocation? Don't + * account extra agfl blocks because we are about to defer free them, + * making them unavailable until the current transaction commits. 
+ */ + agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); + available = (int)(pag->pagf_freeblks + agflcount - reservation - min_free - args->minleft); if (available < (int)max(args->total, alloc_len)) return false; @@ -2088,7 +2086,7 @@ xfs_free_agfl_block( if (error) return error; - bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0); + bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno); if (!bp) return -EFSCORRUPTED; xfs_trans_binval(tp, bp); @@ -2236,6 +2234,9 @@ xfs_alloc_fix_freelist( xfs_extlen_t need; /* total blocks needed in freelist */ int error = 0; + /* deferred ops (AGFL block frees) require permanent transactions */ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + if (!pag->pagf_init) { error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); if (error) @@ -2576,7 +2577,7 @@ static xfs_failaddr_t xfs_agf_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); if (xfs_sb_version_hascrc(&mp->m_sb)) { @@ -2587,8 +2588,10 @@ xfs_agf_verify( return __this_address; } - if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + if (!xfs_verify_magic(bp, agf->agf_magicnum)) + return __this_address; + + if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && @@ -2632,7 +2635,7 @@ static void xfs_agf_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && @@ -2649,7 +2652,7 @@ static void xfs_agf_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; xfs_failaddr_t fa; @@ -2670,6 +2673,7 @@ xfs_agf_write_verify( const struct xfs_buf_ops xfs_agf_buf_ops = { .name = "xfs_agf", + .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, .verify_struct = xfs_agf_verify, @@ -3133,7 +3137,7 @@ xfs_alloc_has_record( /* * Walk all the blocks in the AGFL. The @walk_fn can return any negative - * error code or XFS_BTREE_QUERY_RANGE_ABORT. + * error code or XFS_ITER_*. 
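The agflcount clamp is the heart of that availability fix: surplus AGFL blocks are about to be defer-freed, so at most min_free of them may count as available. A small demonstration with toy numbers showing how the unclamped sum used to overstate availability:

#include <stdint.h>
#include <stdio.h>

#define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

static int space_available(uint32_t freeblks, uint32_t flcount,
                           uint32_t min_free, uint32_t reservation,
                           uint32_t minleft, uint32_t want)
{
    uint32_t agflcount = min_t(uint32_t, flcount, min_free);
    int available = (int)(freeblks + agflcount - reservation -
                          min_free - minleft);

    return available >= (int)want;
}

int main(void)
{
    /* 8 AGFL blocks but min_free of 4: the 4 surplus blocks are about
     * to be defer-freed and must not satisfy a 12 block request */
    printf("available: %d\n", space_available(10, 8, 4, 0, 0, 12));
    /* the old unclamped math would have yielded 10 + 8 - 4 = 14 >= 12 */
    return 0;
}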
*/ int xfs_agfl_walk( diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 4e59cc8a2802..2a94543857a1 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -17,7 +17,6 @@ #include "xfs_extent_busy.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_trans.h" @@ -292,53 +291,39 @@ static xfs_failaddr_t xfs_allocbt_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; + xfs_btnum_t btnum = XFS_BTNUM_BNOi; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + fa = xfs_btree_sblock_v5hdr_verify(bp); + if (fa) + return fa; + } /* - * magic number and level verification - * - * During growfs operations, we can't verify the exact level or owner as - * the perag is not fully initialised and hence not attached to the - * buffer. In this case, check against the maximum tree depth. + * The perag may not be attached during grow operations or fully + * initialized from the AGF during log recovery. Therefore we can only + * check against maximum tree depth from those contexts. * - * Similarly, during log recovery we will have a perag structure - * attached, but the agf information will not yet have been initialised - * from the on disk AGF. Again, we can only check against maximum limits - * in this case. + * Otherwise check against the per-tree limit. Peek at one of the + * verifier magic values to determine the type of tree we're verifying + * against. */ level = be16_to_cpu(block->bb_level); - switch (block->bb_magic) { - case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTB_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) + btnum = XFS_BTNUM_CNTi; + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[btnum]) return __this_address; - break; - case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - fa = xfs_btree_sblock_v5hdr_verify(bp); - if (fa) - return fa; - /* fall through */ - case cpu_to_be32(XFS_ABTC_MAGIC): - if (pag && pag->pagf_init) { - if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) - return __this_address; - } else if (level >= mp->m_ag_maxlevels) - return __this_address; - break; - default: + } else if (level >= mp->m_ag_maxlevels) return __this_address; - } return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } @@ -377,13 +362,23 @@ xfs_allocbt_write_verify( } -const struct xfs_buf_ops xfs_allocbt_buf_ops = { - .name = "xfs_allocbt", +const struct xfs_buf_ops xfs_bnobt_buf_ops = { + .name = "xfs_bnobt", + .magic = { cpu_to_be32(XFS_ABTB_MAGIC), + cpu_to_be32(XFS_ABTB_CRC_MAGIC) }, .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, .verify_struct = xfs_allocbt_verify, }; +const struct xfs_buf_ops xfs_cntbt_buf_ops = { + .name = "xfs_cntbt", + .magic = { cpu_to_be32(XFS_ABTC_MAGIC), + cpu_to_be32(XFS_ABTC_CRC_MAGIC) }, + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, + .verify_struct = xfs_allocbt_verify, +}; STATIC int xfs_bnobt_keys_inorder( @@ -448,7 +443,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = 
{ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_bnobt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_bnobt_buf_ops, .diff_two_keys = xfs_bnobt_diff_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, @@ -470,7 +465,7 @@ static const struct xfs_btree_ops xfs_cntbt_ops = { .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, .key_diff = xfs_cntbt_key_diff, - .buf_ops = &xfs_allocbt_buf_ops, + .buf_ops = &xfs_cntbt_buf_ops, .diff_two_keys = xfs_cntbt_diff_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 844ed87b1900..d48fcf11cc35 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -9,23 +9,18 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr_sf.h" #include "xfs_inode.h" -#include "xfs_alloc.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_attr_remote.h" -#include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" @@ -224,10 +219,10 @@ xfs_attr_try_sf_addname( */ int xfs_attr_set_args( - struct xfs_da_args *args, - struct xfs_buf **leaf_bp) + struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; + struct xfs_buf *leaf_bp = NULL; int error; /* @@ -255,7 +250,7 @@ xfs_attr_set_args( * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - error = xfs_attr_shortform_to_leaf(args, leaf_bp); + error = xfs_attr_shortform_to_leaf(args, &leaf_bp); if (error) return error; @@ -263,23 +258,16 @@ xfs_attr_set_args( * Prevent the leaf buffer from being unlocked so that a * concurrent AIL push cannot grab the half-baked leaf * buffer and run into problems with the write verifier. + * Once we're done rolling the transaction we can release + * the hold and add the attr to the leaf. */ - xfs_trans_bhold(args->trans, *leaf_bp); - + xfs_trans_bhold(args->trans, leaf_bp); error = xfs_defer_finish(&args->trans); - if (error) - return error; - - /* - * Commit the leaf transformation. We'll need another - * (linked) transaction to add the new attribute to the - * leaf. 
- */ - error = xfs_trans_roll_inode(&args->trans, dp); - if (error) + xfs_trans_bhold_release(args->trans, leaf_bp); + if (error) { + xfs_trans_brelse(args->trans, leaf_bp); return error; - xfs_trans_bjoin(args->trans, *leaf_bp); - *leaf_bp = NULL; + } } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -322,7 +310,6 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; - struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_trans_res tres; int rsvd = (flags & ATTR_ROOT) != 0; @@ -381,9 +368,9 @@ xfs_attr_set( goto out_trans_cancel; xfs_trans_ijoin(args.trans, dp, 0); - error = xfs_attr_set_args(&args, &leaf_bp); + error = xfs_attr_set_args(&args); if (error) - goto out_release_leaf; + goto out_trans_cancel; if (!args.trans) { /* shortform attribute has already been committed */ goto out_unlock; @@ -408,9 +395,6 @@ out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; -out_release_leaf: - if (leaf_bp) - xfs_trans_brelse(args.trans, leaf_bp); out_trans_cancel: if (args.trans) xfs_trans_cancel(args.trans); @@ -1336,3 +1320,20 @@ xfs_attr_node_get(xfs_da_args_t *args) xfs_da_state_free(state); return retval; } + +/* Returns true if the attribute entry name is valid. */ +bool +xfs_attr_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any nulls here */ + return !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index bdf52a333f3f..ff28ebf3b635 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -112,7 +112,13 @@ typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ struct attrlist_cursor_kern *cursor; /* position in list */ char *alist; /* output buffer */ - int seen_enough; /* T/F: seen enough of list? */ + + /* + * Abort attribute list iteration if non-zero. Can be used to pass + * error values to the xfs_attr_list caller. 
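The bhold/bhold_release pairing above is the interesting part of the attr rework: the new leaf buffer stays locked across the transaction roll inside xfs_defer_finish(), and on failure the caller must now release it explicitly. A toy model of that life-cycle with stand-in types (nothing below is real XFS API):

#include <stdbool.h>
#include <stdio.h>

struct buf { bool locked, held; };

static void bhold(struct buf *bp)         { bp->held = true; }
static void bhold_release(struct buf *bp) { bp->held = false; }
static void brelse(struct buf *bp)        { bp->locked = false; }

/* a transaction roll unlocks everything that is not explicitly held */
static int defer_finish(struct buf *bp, bool fail)
{
    if (!bp->held)
        bp->locked = false;
    return fail ? -5 /* -EIO */ : 0;
}

/* mirrors the ordering in the reworked xfs_attr_set_args() */
static int set_args(struct buf *leaf, bool fail)
{
    int error;

    bhold(leaf);                    /* survive the roll still locked */
    error = defer_finish(leaf, fail);
    bhold_release(leaf);            /* rejoin the normal life-cycle */
    if (error) {
        brelse(leaf);               /* error: nobody else will unlock it */
        return error;
    }
    return 0;                       /* still locked for the leaf add */
}

int main(void)
{
    struct buf ok = { .locked = true }, bad = { .locked = true };

    printf("ok path: %d (locked=%d)\n", set_args(&ok, false), ok.locked);
    printf("err path: %d (locked=%d)\n", set_args(&bad, true), bad.locked);
    return 0;
}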
+ */ + int seen_enough; + ssize_t count; /* num used entries */ int dupcnt; /* count dup hashvals seen */ int bufsize; /* total buffer size */ @@ -140,11 +146,11 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, unsigned char *value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, unsigned char *value, int valuelen, int flags); -int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp); +int xfs_attr_set_args(struct xfs_da_args *args); int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); - +bool xfs_attr_namecheck(const void *name, size_t length); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2652d00842d6..70eb941d02e4 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -10,14 +10,12 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_attr_sf.h" @@ -27,7 +25,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_buf_item.h" -#include "xfs_cksum.h" #include "xfs_dir2.h" #include "xfs_log.h" @@ -240,30 +237,19 @@ xfs_attr3_leaf_verify( struct xfs_buf *bp) { struct xfs_attr3_icleaf_hdr ichdr; - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr_leaf_entry *entries; uint32_t end; /* must be 32bit - see below */ int i; + xfs_failaddr_t fa; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) - return __this_address; - } /* * In recovery there is a transient state where count == 0 is valid * because we may have transitioned an empty shortform attr to a leaf @@ -324,7 +310,7 @@ static void xfs_attr3_leaf_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; @@ -354,7 +340,7 @@ static void xfs_attr3_leaf_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && @@ -369,6 +355,8 @@ xfs_attr3_leaf_read_verify( const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { .name = "xfs_attr3_leaf", + .magic16 = { cpu_to_be16(XFS_ATTR_LEAF_MAGIC), + cpu_to_be16(XFS_ATTR3_LEAF_MAGIC) }, .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, .verify_struct = xfs_attr3_leaf_verify, @@ -874,7 +862,7 @@ 
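The new xfs_attr_namecheck() is small enough to lift into userspace nearly verbatim; the only assumption below is MAXNAMELEN = 256, the XFS limit that includes the trailing NUL:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define MAXNAMELEN 256  /* includes the trailing NUL on disk */

/* (name, length) pairs must fit and must not embed NUL bytes */
static bool attr_namecheck(const void *name, size_t length)
{
    if (length >= MAXNAMELEN)
        return false;
    return !memchr(name, 0, length);
}

int main(void)
{
    printf("%d %d %d\n",
           attr_namecheck("user.comment", 12),  /* valid */
           attr_namecheck("bad\0name", 8),      /* embedded NUL: invalid */
           attr_namecheck("", 0));              /* empty passes this check */
    return 0;
}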
xfs_attr_shortform_allfit( struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); @@ -1534,7 +1522,7 @@ xfs_attr_leaf_order( { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; - struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; + struct xfs_mount *mp = leaf1_bp->b_mount; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); @@ -2577,7 +2565,7 @@ xfs_attr_leaf_lasthash( { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); entries = xfs_attr3_leaf_entryp(bp->b_addr); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d89363c6b523..4eb30d357045 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -16,18 +16,10 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" -#include "xfs_alloc.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_attr.h" -#include "xfs_attr_leaf.h" -#include "xfs_attr_remote.h" -#include "xfs_trans_space.h" #include "xfs_trace.h" -#include "xfs_cksum.h" -#include "xfs_buf_item.h" #include "xfs_error.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ @@ -79,6 +71,7 @@ xfs_attr3_rmt_hdr_ok( static xfs_failaddr_t xfs_attr3_rmt_verify( struct xfs_mount *mp, + struct xfs_buf *bp, void *ptr, int fsbsize, xfs_daddr_t bno) @@ -87,7 +80,7 @@ xfs_attr3_rmt_verify( if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -110,7 +103,7 @@ __xfs_attr3_rmt_read_verify( bool check_crc, xfs_failaddr_t *failaddr) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; char *ptr; int len; xfs_daddr_t bno; @@ -131,7 +124,7 @@ __xfs_attr3_rmt_read_verify( *failaddr = __this_address; return -EFSBADCRC; } - *failaddr = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + *failaddr = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (*failaddr) return -EFSCORRUPTED; len -= blksize; @@ -174,7 +167,7 @@ static void xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; int blksize = mp->m_attr_geo->blksize; char *ptr; @@ -193,7 +186,7 @@ xfs_attr3_rmt_write_verify( while (len > 0) { struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; - fa = xfs_attr3_rmt_verify(mp, ptr, blksize, bno); + fa = xfs_attr3_rmt_verify(mp, bp, ptr, blksize, bno); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -220,6 +213,7 @@ xfs_attr3_rmt_write_verify( const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .name = "xfs_attr3_rmt", + .magic = { 0, cpu_to_be32(XFS_ATTR3_RMT_MAGIC) }, .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, .verify_struct = xfs_attr3_rmt_verify_struct, @@ -533,7 +527,7 @@ xfs_attr_rmtval_set( dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), dblkcnt = XFS_FSB_TO_BB(mp, 
map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt); if (!bp) return -ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 40ce5f3094d1..7071ff98fdbc 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -5,7 +5,6 @@ */ #include "xfs.h" #include "xfs_log_format.h" -#include "xfs_bit.h" /* * XFS bit manipulation routines, used in non-realtime code. diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 332eefa2700b..baf0b72c0a37 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -13,14 +13,10 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" -#include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -32,7 +28,6 @@ #include "xfs_trans_space.h" #include "xfs_buf_item.h" #include "xfs_trace.h" -#include "xfs_symlink.h" #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" @@ -370,7 +365,7 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + error = xfs_btree_read_bufl(mp, NULL, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) @@ -454,7 +449,7 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + error = xfs_btree_read_bufl(mp, NULL, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) @@ -577,47 +572,49 @@ __xfs_bmap_add_free( */ /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Convert the inode format to extent format if it currently is in btree format, + * but the extent list is small enough that it fits into the extent format. + * + * Since the extents are already in-core, all we have to do is give up the space + * for the btree root and pitch the leaf block. 
*/ STATIC int /* error */ xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - /* REFERENCED */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ xfs_fsblock_t cbno; /* child block number */ xfs_buf_t *cbp; /* child block's buffer */ int error; /* error return value */ - struct xfs_ifork *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ __be64 *pp; /* ptr to block address */ - struct xfs_btree_block *rblock;/* root btree block */ struct xfs_owner_info oinfo; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); + /* check if we actually need the extent format first: */ + if (!xfs_bmap_wants_extents(ip, whichfork)) + return 0; + + ASSERT(cur); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; ASSERT(be16_to_cpu(rblock->bb_level) == 1); ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); - *logflagsp = 0; #ifdef DEBUG XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, xfs_btree_check_lptr(cur, cbno, 1)); #endif - error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; @@ -635,7 +632,7 @@ xfs_bmap_btree_to_extents( ASSERT(ifp->if_broot == NULL); ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + *logflagsp |= XFS_ILOG_CORE | xfs_ilog_fext(whichfork); return 0; } @@ -730,7 +727,7 @@ xfs_bmap_extents_to_btree( cur->bc_private.b.allocated++; ip->i_d.di_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno); if (!abp) { error = -EFSCORRUPTED; goto out_unreserve_dquot; @@ -876,7 +873,7 @@ xfs_bmap_local_to_extents( ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); tp->t_firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno); /* * Initialize the block, copy the data and log the remote buffer. @@ -1189,7 +1186,10 @@ xfs_iread_extents( * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. */ level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); + if (unlikely(level == 0)) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); bno = be64_to_cpu(*pp); @@ -1198,7 +1198,7 @@ xfs_iread_extents( * pointer (leftmost) at each level. 
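Note that the rewritten helper now ORs into *logflagsp instead of assigning, so flags a caller accumulated before the conversion survive. A two-line illustration, using the XFS_ILOG_* bit values as I understand them from xfs_log_format.h:

#include <stdio.h>

#define XFS_ILOG_CORE  0x001
#define XFS_ILOG_DDATA 0x002
#define XFS_ILOG_DEXT  0x004

/* now a no-op unless the fork actually wants extent format */
static void btree_to_extents(int *logflagsp, int needs_conversion)
{
    if (!needs_conversion)
        return;
    *logflagsp |= XFS_ILOG_CORE | XFS_ILOG_DEXT;  /* |= not =: keep caller bits */
}

int main(void)
{
    int logflags = XFS_ILOG_DDATA;   /* caller already logged the data fork */
    btree_to_extents(&logflags, 1);
    printf("logflags = 0x%x\n", logflags);  /* 0x7, DDATA preserved */
    return 0;
}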
*/ while (level-- > 0) { - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) goto out; @@ -1271,7 +1271,7 @@ xfs_iread_extents( */ if (bno == NULLFSBLOCK) break; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) goto out; @@ -2004,6 +2004,9 @@ xfs_bmap_add_extent_delay_real( goto done; } + if (da_new != da_old) + xfs_mod_delalloc(mp, (int64_t)da_new - da_old); + if (bma->cur) { da_new += bma->cur->bc_private.b.allocated; bma->cur->bc_private.b.allocated = 0; @@ -2029,7 +2032,7 @@ done: /* * Convert an unwritten allocation to a real allocation or vice versa. */ -STATIC int /* error */ +int /* error */ xfs_bmap_add_extent_unwritten_real( struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ @@ -2635,6 +2638,7 @@ xfs_bmap_add_extent_hole_delay( /* * Nothing to do for disk quota accounting here. */ + xfs_mod_delalloc(ip->i_mount, (int64_t)newlen - oldlen); } } @@ -3347,8 +3351,10 @@ xfs_bmap_btalloc_accounting( * already have quota reservation and there's nothing to do * yet. */ - if (ap->wasdel) + if (ap->wasdel) { + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); return; + } /* * Otherwise, we've allocated blocks in a hole. The transaction @@ -3367,8 +3373,10 @@ xfs_bmap_btalloc_accounting( /* data/attr fork only */ ap->ip->i_d.di_nblocks += args->len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) + if (ap->wasdel) { ap->ip->i_delayed_blks -= args->len; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + } xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, args->len); @@ -3685,17 +3693,6 @@ xfs_trim_extent( } } -/* trim extent to within eof */ -void -xfs_trim_extent_eof( - struct xfs_bmbt_irec *irec, - struct xfs_inode *ip) - -{ - xfs_trim_extent(irec, 0, XFS_B_TO_FSB(ip->i_mount, - i_size_read(VFS_I(ip)))); -} - /* * Trim the returned map to the required bounds */ @@ -3975,6 +3972,7 @@ xfs_bmapi_reserve_delalloc( ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip->i_mount, alen + indlen); got->br_startoff = aoff; got->br_startblock = nullstartblock(indlen); @@ -4203,6 +4201,44 @@ xfs_bmapi_convert_unwritten( return 0; } +static inline xfs_extlen_t +xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, + int fork) +{ + if (tp && tp->t_firstblock != NULLFSBLOCK) + return 0; + if (XFS_IFORK_FORMAT(ip, fork) != XFS_DINODE_FMT_BTREE) + return 1; + return be16_to_cpu(XFS_IFORK_PTR(ip, fork)->if_broot->bb_level) + 1; +} + +/* + * Log whatever the flags say, even if error. Otherwise we might miss detecting + * a case where the data is changed, there's an error, and it's not logged so we + * don't shutdown when we should. Don't bother logging extents/btree changes if + * we converted to the other format. 
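xfs_bmapi_minleft() centralizes the headroom calculation xfs_bmapi_write() used to open-code: no headroom once an allocation is already in flight, one block for a possible extents-to-btree conversion, otherwise enough for a split all the way up to a new root. A direct stand-in:

#include <stdbool.h>
#include <stdio.h>

static unsigned bmapi_minleft(bool have_firstblock, bool fork_is_btree,
                              unsigned btree_level)
{
    if (have_firstblock)
        return 0;                /* allocation already in progress */
    if (!fork_is_btree)
        return 1;                /* may convert extents -> btree root */
    return btree_level + 1;      /* room for a split up to a new root */
}

int main(void)
{
    printf("%u %u %u\n",
           bmapi_minleft(true, true, 3),    /* 0 */
           bmapi_minleft(false, false, 0),  /* 1 */
           bmapi_minleft(false, true, 2));  /* 3 */
    return 0;
}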
+ */ +static void +xfs_bmapi_finish( + struct xfs_bmalloca *bma, + int whichfork, + int error) +{ + if ((bma->logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma->logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma->logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(bma->ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma->logflags &= ~xfs_ilog_fbroot(whichfork); + + if (bma->logflags) + xfs_trans_log_inode(bma->tp, bma->ip, bma->logflags); + if (bma->cur) + xfs_btree_del_cursor(bma->cur, error); +} + /* * Map file blocks to filesystem blocks, and allocate blocks or convert the * extent state if necessary. Details behaviour is controlled by the flags @@ -4220,9 +4256,13 @@ xfs_bmapi_write( struct xfs_bmbt_irec *mval, /* output: map values */ int *nmap) /* i/o: mval size/count */ { + struct xfs_bmalloca bma = { + .tp = tp, + .ip = ip, + .total = total, + }; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ @@ -4247,9 +4287,7 @@ xfs_bmapi_write( ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(tp != NULL || - (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) == - (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)); + ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4282,34 +4320,21 @@ xfs_bmapi_write( XFS_STATS_INC(mp, xs_blk_mapw); - if (!tp || tp->t_firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - bma.minleft = 1; - } else { - bma.minleft = 0; - } - if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(tp, ip, whichfork); if (error) goto error0; } - n = 0; - end = bno + len; - obno = bno; - if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.icur, &bma.got)) eof = true; if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) bma.prev.br_startoff = NULLFILEOFF; - bma.tp = tp; - bma.ip = ip; - bma.total = total; - bma.datatype = 0; + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + n = 0; + end = bno + len; + obno = bno; while (bno < end && n < *nmap) { bool need_alloc = false, wasdelay = false; @@ -4323,26 +4348,7 @@ xfs_bmapi_write( ASSERT(!((flags & XFS_BMAPI_CONVERT) && (flags & XFS_BMAPI_COWFORK))); - if (flags & XFS_BMAPI_DELALLOC) { - /* - * For the COW fork we can reasonably get a - * request for converting an extent that races - * with other threads already having converted - * part of it, as there converting COW to - * regular blocks is not protected using the - * IOLOCK. - */ - ASSERT(flags & XFS_BMAPI_COWFORK); - if (!(flags & XFS_BMAPI_COWFORK)) { - error = -EIO; - goto error0; - } - - if (eof || bno >= end) - break; - } else { - need_alloc = true; - } + need_alloc = true; } else if (isnullstartblock(bma.got.br_startblock)) { wasdelay = true; } @@ -4351,8 +4357,7 @@ xfs_bmapi_write( * First, deal with the hole before the allocated space * that we found, if any. */ - if ((need_alloc || wasdelay) && - !(flags & XFS_BMAPI_CONVERT_ONLY)) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4420,49 +4425,130 @@ xfs_bmapi_write( } *nmap = n; - /* - * Transform from btree to extents, give it cur. 
- */ - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - ASSERT(bma.cur); - error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, - &tmp_logflags, whichfork); - bma.logflags |= tmp_logflags; - if (error) - goto error0; - } + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto error0; ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork)); - error = 0; + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return 0; error0: + xfs_bmapi_finish(&bma, whichfork, error); + return error; +} + +/* + * Convert an existing delalloc extent to real blocks based on file offset. This + * attempts to allocate the entire delalloc extent and may require multiple + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +int +xfs_bmapi_convert_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap, + unsigned int *seq) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmalloca bma = { NULL }; + struct xfs_trans *tp; + int error; + /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. + * Space for the extent and indirect blocks was reserved when the + * delalloc extent was created so there's no need to do so here. */ - if ((bma.logflags & xfs_ilog_fext(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - bma.logflags &= ~xfs_ilog_fext(whichfork); - else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - bma.logflags &= ~xfs_ilog_fbroot(whichfork); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) || + bma.got.br_startoff > offset_fsb) { + /* + * No extent found in the range we are trying to convert. This + * should only happen for the COW fork, where another thread + * might have moved the extent to the data fork in the meantime. + */ + WARN_ON_ONCE(whichfork != XFS_COW_FORK); + error = -EAGAIN; + goto out_trans_cancel; + } + /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. + * If we find a real extent here we raced with another thread converting + * the extent. Just return the real extent at this offset. 
*/ - if (bma.logflags) - xfs_trans_log_inode(tp, ip, bma.logflags); + if (!isnullstartblock(bma.got.br_startblock)) { + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } + + bma.tp = tp; + bma.ip = ip; + bma.wasdel = true; + bma.offset = bma.got.br_startoff; + bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN); + bma.total = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork); + if (whichfork == XFS_COW_FORK) + bma.flags = XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; + + if (!xfs_iext_peek_prev_extent(ifp, &bma.icur, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; + + error = xfs_bmapi_allocate(&bma); + if (error) + goto out_finish; + + error = -ENOSPC; + if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) + goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!bma.got.br_startblock && !XFS_IS_REALTIME_INODE(ip))) + goto out_finish; + + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); + XFS_STATS_INC(mp, xs_xstrat_quick); - if (bma.cur) { - xfs_btree_del_cursor(bma.cur, error); + ASSERT(!isnullstartblock(bma.got.br_startblock)); + *imap = bma.got; + *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(tp, bma.blkno, + bma.length); + if (error) + goto out_finish; } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); + if (error) + goto out_finish; + + xfs_bmapi_finish(&bma, whichfork, 0); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_finish: + xfs_bmapi_finish(&bma, whichfork, error); +out_trans_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -4536,13 +4622,7 @@ xfs_bmapi_remap( if (error) goto error0; - if (xfs_bmap_wants_extents(ip, whichfork)) { - int tmp_logflags = 0; - - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - } + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); error0: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) @@ -4764,8 +4844,10 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; if (!isrt) da_diff += del->br_blockcount; - if (da_diff) + if (da_diff) { xfs_mod_fdblocks(mp, da_diff, false); + xfs_mod_delalloc(mp, -da_diff); + } return error; } @@ -5406,24 +5488,11 @@ nodelete: error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, &tmp_logflags, whichfork); logflags |= tmp_logflags; - if (error) - goto error0; - } - /* - * transform from btree to extents, give it cur - */ - else if (xfs_bmap_wants_extents(ip, whichfork)) { - ASSERT(cur != NULL); - error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, + } else { + error = xfs_bmap_btree_to_extents(tp, ip, cur, &logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; } - /* - * transform from extents to local? - */ - error = 0; + error0: /* * Log everything. Do this after conversion, there's no point in diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 09d3ea97cc15..8f597f9abdbe 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -95,12 +95,6 @@ struct xfs_extent_free_item /* Map something in the CoW fork. 
*/ #define XFS_BMAPI_COWFORK 0x200 -/* Only convert delalloc space, don't allocate entirely new extents */ -#define XFS_BMAPI_DELALLOC 0x400 - -/* Only convert unwritten extents, don't allocate new blocks */ -#define XFS_BMAPI_CONVERT_ONLY 0x800 - /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD 0x1000 @@ -117,8 +111,6 @@ struct xfs_extent_free_item { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ - { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } @@ -181,7 +173,6 @@ static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); -void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); @@ -228,6 +219,13 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap, + unsigned int *seq); +int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); static inline void xfs_bmap_add_free( diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cdb74d2e2a43..fbb18ba5d905 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -11,10 +11,8 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_bmap_btree.h" @@ -22,7 +20,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_rmap.h" /* @@ -411,13 +408,15 @@ static xfs_failaddr_t xfs_bmbt_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); xfs_failaddr_t fa; unsigned int level; - switch (block->bb_magic) { - case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. 
@@ -425,11 +424,6 @@ xfs_bmbt_verify( fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_BMAP_MAGIC): - break; - default: - return __this_address; } /* @@ -481,6 +475,8 @@ xfs_bmbt_write_verify( const struct xfs_buf_ops xfs_bmbt_buf_ops = { .name = "xfs_bmbt", + .magic = { cpu_to_be32(XFS_BMAP_MAGIC), + cpu_to_be32(XFS_BMAP_CRC_MAGIC) }, .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, .verify_struct = xfs_bmbt_verify, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index bbdae2b4559f..f1048efa4268 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -11,16 +11,13 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_buf_item.h" #include "xfs_btree.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_alloc.h" #include "xfs_log.h" @@ -276,7 +273,7 @@ xfs_btree_lblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -288,7 +285,7 @@ xfs_btree_lblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) @@ -314,7 +311,7 @@ xfs_btree_sblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -326,7 +323,7 @@ xfs_btree_sblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; if (xfs_sb_version_hascrc(&mp->m_sb)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) @@ -691,14 +688,13 @@ xfs_buf_t * /* buffer for fsbno */ xfs_btree_get_bufl( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock) /* lock flags for get_buf */ + xfs_fsblock_t fsbno) /* file system block number */ { xfs_daddr_t d; /* real disk block address */ ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, 0); } /* @@ -710,15 +706,14 @@ xfs_btree_get_bufs( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock) /* lock flags for get_buf */ + xfs_agblock_t agbno) /* allocation group block number */ { xfs_daddr_t d; /* real disk block address */ ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); + return xfs_trans_get_buf(tp, 
mp->m_ddev_targp, d, mp->m_bsize, 0); } /* @@ -845,7 +840,6 @@ xfs_btree_read_bufl( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ const struct xfs_buf_ops *ops) @@ -858,7 +852,7 @@ xfs_btree_read_bufl( return -EFSCORRUPTED; d = XFS_FSB_TO_DADDR(mp, fsbno); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp, ops); + mp->m_bsize, 0, &bp, ops); if (error) return error; if (bp) @@ -1185,11 +1179,10 @@ xfs_btree_init_block( xfs_btnum_t btnum, __u16 level, __u16 numrecs, - __u64 owner, - unsigned int flags) + __u64 owner) { xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - btnum, level, numrecs, owner, flags); + btnum, level, numrecs, owner, 0); } STATIC void @@ -1288,7 +1281,6 @@ STATIC int xfs_btree_get_buf_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, - int flags, struct xfs_btree_block **block, struct xfs_buf **bpp) { @@ -1296,14 +1288,11 @@ xfs_btree_get_buf_block( xfs_daddr_t d; int error; - /* need to sort out how callers deal with failures first */ - ASSERT(!(flags & XBF_TRYLOCK)); - error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags); + mp->m_bsize, 0); if (!*bpp) return -ENOMEM; @@ -2706,7 +2695,7 @@ __xfs_btree_split( XFS_BTREE_STATS_INC(cur, alloc); /* Set up the new block as "right". */ - error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp); if (error) goto error0; @@ -2961,7 +2950,7 @@ xfs_btree_new_iroot( XFS_BTREE_STATS_INC(cur, alloc); /* Copy the root into a real block. */ - error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); + error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp); if (error) goto error0; @@ -3058,7 +3047,7 @@ xfs_btree_new_root( XFS_BTREE_STATS_INC(cur, alloc); /* Set up the new block. 
*/ - error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp); if (error) goto error0; @@ -4433,7 +4422,7 @@ xfs_btree_lblock_v5hdr_verify( struct xfs_buf *bp, uint64_t owner) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -4454,7 +4443,7 @@ xfs_btree_lblock_verify( struct xfs_buf *bp, unsigned int max_recs) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); /* numrecs verification */ @@ -4484,7 +4473,7 @@ xfs_failaddr_t xfs_btree_sblock_v5hdr_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; @@ -4510,7 +4499,7 @@ xfs_btree_sblock_verify( struct xfs_buf *bp, unsigned int max_recs) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); xfs_agblock_t agno; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index e3b3e9dce5da..fa3cd8ab9aba 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -301,8 +301,7 @@ struct xfs_buf * /* buffer for fsbno */ xfs_btree_get_bufl( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock); /* lock flags for get_buf */ + xfs_fsblock_t fsbno); /* file system block number */ /* * Get a buffer for the block, return it with no data read. @@ -313,8 +312,7 @@ xfs_btree_get_bufs( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock); /* lock flags for get_buf */ + xfs_agblock_t agbno); /* allocation group block number */ /* * Check for the cursor referring to the last block at the given level. 
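Every caller of xfs_btree_get_bufl() and xfs_btree_get_bufs() passed 0 for the lock argument, so the hunks above fold the constant into the wrappers and drop the parameter. A minimal sketch of the same dead-parameter refactor, using hypothetical names (trans_get_buf, get_block) rather than the kernel API:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical low-level getter; flags survive here for callers that need them. */
static int trans_get_buf(uint64_t daddr, int len, unsigned int flags)
{
	printf("get_buf daddr=%llu len=%d flags=%#x\n",
	       (unsigned long long)daddr, len, flags);
	return 0;
}

/*
 * Before the refactor this wrapper also took "unsigned int lock", but every
 * caller passed 0; folding the constant in makes the prototype document
 * reality and keeps stale flags from creeping back into call sites.
 */
static int get_block(uint64_t daddr, int len)
{
	return trans_get_buf(daddr, len, 0);
}

int main(void)
{
	return get_block(1024, 8);
}

The related hunk in xfs_btree_get_buf_block() also deletes the ASSERT(!(flags & XBF_TRYLOCK)) guard, which existed only because no caller was prepared to handle a trylock failure; with the parameter gone, that invariant holds by construction.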
@@ -345,7 +343,6 @@ xfs_btree_read_bufl( struct xfs_mount *mp, /* file system mount point */ struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ int refval, /* ref count value for buffer */ const struct xfs_buf_ops *ops); @@ -383,8 +380,7 @@ xfs_btree_init_block( xfs_btnum_t btnum, __u16 level, __u16 numrecs, - __u64 owner, - unsigned int flags); + __u64 owner); void xfs_btree_init_block_int( @@ -469,8 +465,8 @@ uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ -#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ -#define XFS_BTREE_QUERY_RANGE_ABORT 1 /* stop iterating */ +#define XFS_BTREE_QUERY_RANGE_CONTINUE (XFS_ITER_CONTINUE) /* keep iterating */ +#define XFS_BTREE_QUERY_RANGE_ABORT (XFS_ITER_ABORT) /* stop iterating */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec, void *priv); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 376bee94b5dd..d1c77fd0815d 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -12,20 +12,14 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" #include "xfs_bmap.h" -#include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_buf_item.h" #include "xfs_log.h" @@ -116,35 +110,52 @@ xfs_da_state_free(xfs_da_state_t *state) kmem_zone_free(xfs_da_state_zone, state); } +/* + * Verify an xfs_da3_blkinfo structure. Note that the da3 fields are only + * accessible on v5 filesystems. This header format is common across da node, + * attr leaf and dir leaf blocks. 
+ */ +xfs_failaddr_t +xfs_da3_blkinfo_verify( + struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_da_blkinfo *hdr = &hdr3->hdr; + + if (!xfs_verify_magic16(bp, hdr->magic)) + return __this_address; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return __this_address; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return __this_address; + } + + return NULL; +} + static xfs_failaddr_t xfs_da3_node_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_da_intnode *hdr = bp->b_addr; struct xfs_da3_icnode_hdr ichdr; const struct xfs_dir_ops *ops; + xfs_failaddr_t fa; ops = xfs_dir_get_ops(mp, NULL); ops->node_hdr_from_disk(&ichdr, hdr); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_da3_node_hdr *hdr3 = bp->b_addr; - - if (ichdr.magic != XFS_DA3_NODE_MAGIC) - return __this_address; + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; - if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) - return __this_address; - } else { - if (ichdr.magic != XFS_DA_NODE_MAGIC) - return __this_address; - } if (ichdr.level == 0) return __this_address; if (ichdr.level > XFS_DA_NODE_MAXDEPTH) @@ -169,7 +180,7 @@ static void xfs_da3_node_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_da3_node_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; @@ -257,6 +268,8 @@ xfs_da3_node_verify_struct( const struct xfs_buf_ops xfs_da3_node_buf_ops = { .name = "xfs_da3_node", + .magic16 = { cpu_to_be16(XFS_DA_NODE_MAGIC), + cpu_to_be16(XFS_DA3_NODE_MAGIC) }, .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, .verify_struct = xfs_da3_node_verify_struct, diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index b39053dcb643..b1ae572496b6 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -11,11 +11,8 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" -#include "xfs_dir2_priv.h" /* * Shortform directory ops diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 5d5bf3bffc78..ae654e06b2fb 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -869,4 +869,7 @@ static inline unsigned int xfs_dir2_dirblock_bytes(struct xfs_sb *sbp) return 1 << (sbp->sb_blocklog + sbp->sb_dirblklog); } +xfs_failaddr_t xfs_da3_blkinfo_verify(struct xfs_buf *bp, + struct xfs_da3_blkinfo *hdr3); + #endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 94f00427de98..eb2be2a6a25a 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -9,8 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -274,13 +272,15 @@ xfs_defer_trans_roll( trace_xfs_defer_trans_roll(tp, _RET_IP_); - /* Roll the transaction. 
*/ + /* + * Roll the transaction. Rolling always gives a new transaction (even + * if committing the old one fails!) to hand back to the caller, so we + * join the held resources to the new transaction so that we always + * return with the held resources joined to @tpp, no matter what + * happened. + */ error = xfs_trans_roll(tpp); tp = *tpp; - if (error) { - trace_xfs_defer_trans_roll_error(tp, error); - return error; - } /* Rejoin the joined inodes. */ for (i = 0; i < ipcount; i++) xfs_trans_ijoin(tp, iplist[i], 0); @@ -292,6 +292,8 @@ xfs_defer_trans_roll( xfs_trans_bhold(tp, bplist[i]); } + if (error) + trace_xfs_defer_trans_roll_error(tp, error); return error; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 229152cd1a24..67840723edbb 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -5,20 +5,16 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_ialloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" @@ -703,3 +699,20 @@ xfs_dir2_shrink_inode( xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); return 0; } + +/* Returns true if the directory entry name is valid. */ +bool +xfs_dir2_namecheck( + const void *name, + size_t length) +{ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. + */ + if (length >= MAXNAMELEN) + return false; + + /* There shouldn't be any slashes or nulls here */ + return !memchr(name, '/', length) && !memchr(name, 0, length); +} diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c3e3f6b813d8..f54244779492 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -326,5 +326,6 @@ xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, uint8_t filetype); void *xfs_dir3_data_endp(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr); +bool xfs_dir2_namecheck(const void *name, size_t length); #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 30ed5919da72..a6fb0cc2085e 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -6,22 +6,19 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_buf_item.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_log.h" /* @@ -50,21 +47,19 @@ static xfs_failaddr_t xfs_dir3_block_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno)
!= bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -73,7 +68,7 @@ static void xfs_dir3_block_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && @@ -90,7 +85,7 @@ static void xfs_dir3_block_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; @@ -112,6 +107,8 @@ xfs_dir3_block_write_verify( const struct xfs_buf_ops xfs_dir3_block_buf_ops = { .name = "xfs_dir3_block", + .magic = { cpu_to_be32(XFS_DIR2_BLOCK_MAGIC), + cpu_to_be32(XFS_DIR3_BLOCK_MAGIC) }, .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, .verify_struct = xfs_dir3_block_verify, diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 01162c62ec8f..2c79be4c3153 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -6,19 +6,16 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" -#include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" -#include "xfs_cksum.h" #include "xfs_log.h" static xfs_failaddr_t xfs_dir2_data_freefind_verify( @@ -50,14 +47,13 @@ __xfs_dir3_data_check( int i; /* leaf index */ int lastfree; /* last entry was unused */ xfs_dir2_leaf_entry_t *lep=NULL; /* block leaf entries */ - xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_mount *mp = bp->b_mount; char *p; /* current data position */ int stale; /* count of stale leaves */ struct xfs_name name; const struct xfs_dir_ops *ops; struct xfs_da_geometry *geo; - mp = bp->b_target->bt_mount; geo = mp->m_dir_geo; /* @@ -249,21 +245,19 @@ static xfs_failaddr_t xfs_dir3_data_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + if (!xfs_verify_magic(bp, hdr3->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) - return __this_address; } return __xfs_dir3_data_check(NULL, bp); } @@ -300,7 +294,7 @@ static void xfs_dir3_data_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && @@ -317,7 +311,7 @@ static void xfs_dir3_data_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; @@ -339,6 +333,8 @@ xfs_dir3_data_write_verify( const 
struct xfs_buf_ops xfs_dir3_data_buf_ops = { .name = "xfs_dir3_data", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, .verify_struct = xfs_dir3_data_verify, @@ -346,6 +342,8 @@ const struct xfs_buf_ops xfs_dir3_data_buf_ops = { static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { .name = "xfs_dir3_data_reada", + .magic = { cpu_to_be32(XFS_DIR2_DATA_MAGIC), + cpu_to_be32(XFS_DIR3_DATA_MAGIC) }, .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 1728a3e6f5cf..a53e4585a2f3 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -6,12 +6,11 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_dir2.h" @@ -20,8 +19,6 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" -#include "xfs_cksum.h" -#include "xfs_log.h" /* * Local function declarations. @@ -142,66 +139,46 @@ xfs_dir3_leaf_check_int( */ static xfs_failaddr_t xfs_dir3_leaf_verify( - struct xfs_buf *bp, - uint16_t magic) + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_dir2_leaf *leaf = bp->b_addr; + xfs_failaddr_t fa; - ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); - - if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - uint16_t magic3; - - magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? 
XFS_DIR3_LEAF1_MAGIC - : XFS_DIR3_LEAFN_MAGIC; - - if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) - return __this_address; - if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid)) - return __this_address; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) - return __this_address; - if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) - return __this_address; - } else { - if (leaf->hdr.info.magic != cpu_to_be16(magic)) - return __this_address; - } + fa = xfs_da3_blkinfo_verify(bp, bp->b_addr); + if (fa) + return fa; return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); } static void -__read_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_read_verify( + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } static void -__write_verify( - struct xfs_buf *bp, - uint16_t magic) +xfs_dir3_leaf_write_verify( + struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; - fa = xfs_dir3_leaf_verify(bp, magic); + fa = xfs_dir3_leaf_verify(bp); if (fa) { xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; @@ -216,60 +193,22 @@ __write_verify( xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); } -static xfs_failaddr_t -xfs_dir3_leaf1_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static void -xfs_dir3_leaf1_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); -} - -static xfs_failaddr_t -xfs_dir3_leafn_verify( - struct xfs_buf *bp) -{ - return xfs_dir3_leaf_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_read_verify( - struct xfs_buf *bp) -{ - __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - -static void -xfs_dir3_leafn_write_verify( - struct xfs_buf *bp) -{ - __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); -} - const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { .name = "xfs_dir3_leaf1", - .verify_read = xfs_dir3_leaf1_read_verify, - .verify_write = xfs_dir3_leaf1_write_verify, - .verify_struct = xfs_dir3_leaf1_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAF1_MAGIC), + cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { .name = "xfs_dir3_leafn", - .verify_read = xfs_dir3_leafn_read_verify, - .verify_write = xfs_dir3_leafn_write_verify, - .verify_struct = xfs_dir3_leafn_verify, + .magic16 = { cpu_to_be16(XFS_DIR2_LEAFN_MAGIC), + cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) }, + .verify_read = xfs_dir3_leaf_read_verify, + .verify_write = xfs_dir3_leaf_write_verify, + .verify_struct = xfs_dir3_leaf_verify, }; int @@ -621,43 +560,40 @@ xfs_dir3_leaf_find_entry( */ int /* error */ xfs_dir2_leaf_addname( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_trans *tp = args->trans; __be16 *bestsp; /* freespace table in leaf 
*/ - int compact; /* need to compact leaves */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ + __be16 *tagp; /* end of data entry */ struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data block entry */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry */ + struct xfs_buf *lbp; /* leaf's buffer */ + struct xfs_dir2_leaf *leaf; /* leaf structure */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_entry *dep; /* data block entry */ + struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_data_unused *dup; /* data unused entry */ + struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + int compact; /* need to compact leaves */ int error; /* error return value */ int grown; /* allocated new data block */ - int highstale; /* index of next stale leaf */ + int highstale = 0; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - struct xfs_buf *lbp; /* leaf's buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ int lfloglow; /* low leaf logging index */ int lfloghigh; /* high leaf logging index */ - int lowstale; /* index of prev stale leaf */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + int lowstale = 0; /* index of prev stale leaf */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ - __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ - struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); - dp = args->dp; - tp = args->trans; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index f1bb3434f51c..afcc6642690a 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -6,12 +6,11 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_dir2.h" @@ -20,7 +19,6 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" -#include "xfs_cksum.h" #include "xfs_log.h" /* @@ -84,23 +82,21 @@ static xfs_failaddr_t xfs_dir3_free_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_dir2_free_hdr *hdr = bp->b_addr; + if (!xfs_verify_magic(bp, hdr->magic)) + return __this_address; + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; - if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) - return __this_address; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; - } else { - if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) - return __this_address; } 
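xfs_dir2_leaf_addname() above, and xfs_dir2_leafn_add() in the next hunk, now initialize lowstale and highstale to 0. A plausible motivation, offered here as an assumption rather than fact from this diff: the values are only written on the compaction path, which static analysis cannot always prove, so the initializers keep maybe-uninitialized warnings quiet while making the fallback reads well defined. A reduced standalone illustration:

#include <stdio.h>

/* Writes its outputs only when asked to compact, like the stale-entry scan. */
static int find_stale(int compact, int *lowstale, int *highstale)
{
	if (!compact)
		return 0;	/* outputs untouched on this path */
	*lowstale = 1;
	*highstale = 2;
	return 1;
}

int main(void)
{
	int lowstale = 0, highstale = 0;	/* defined even on the no-compact path */

	/*
	 * Without the initializers, a compiler that cannot see through
	 * find_stale() may warn that the printf below reads uninitialized
	 * values when no compaction happens.
	 */
	if (!find_stale(0, &lowstale, &highstale))
		printf("no stale entries (%d, %d)\n", lowstale, highstale);
	return 0;
}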
/* XXX: should bounds check the xfs_dir3_icfree_hdr here */ @@ -112,7 +108,7 @@ static void xfs_dir3_free_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && @@ -129,7 +125,7 @@ static void xfs_dir3_free_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; xfs_failaddr_t fa; @@ -151,6 +147,8 @@ xfs_dir3_free_write_verify( const struct xfs_buf_ops xfs_dir3_free_buf_ops = { .name = "xfs_dir3_free", + .magic = { cpu_to_be32(XFS_DIR2_FREE_MAGIC), + cpu_to_be32(XFS_DIR3_FREE_MAGIC) }, .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, .verify_struct = xfs_dir3_free_verify, @@ -426,24 +424,22 @@ xfs_dir2_leaf_to_node( static int /* error */ xfs_dir2_leafn_add( struct xfs_buf *bp, /* leaf buffer */ - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ int index) /* insertion pt for new entry */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_inode *dp = args->dp; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *lep; + struct xfs_dir2_leaf_entry *ents; int compact; /* compacting stale leaves */ - xfs_inode_t *dp; /* incore directory inode */ - int highstale; /* next stale entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int highstale = 0; /* next stale entry */ int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ - int lowstale; /* previous stale entry */ - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; + int lowstale = 0; /* previous stale entry */ trace_xfs_dir2_leafn_add(args, index); - dp = args->dp; - leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 585dfdb7b6b6..033589257f54 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -5,16 +5,13 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" -#include "xfs_error.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index d293f371dd54..e8bd688a4073 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -16,8 +16,6 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_error.h" -#include "xfs_cksum.h" -#include "xfs_trace.h" int xfs_calc_dquots_per_chunk( @@ -110,7 +108,7 @@ xfs_dqblk_verify( /* * Do some primitive error checking on ondisk dquot data structures. 
*/ -int +void xfs_dqblk_repair( struct xfs_mount *mp, struct xfs_dqblk *dqb, @@ -133,8 +131,6 @@ xfs_dqblk_repair( xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } - - return 0; } STATIC bool @@ -226,7 +222,7 @@ static xfs_failaddr_t xfs_dquot_buf_verify_struct( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; return xfs_dquot_buf_verify(mp, bp, false); } @@ -235,7 +231,7 @@ static void xfs_dquot_buf_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; if (!xfs_dquot_buf_verify_crc(mp, bp, false)) return; @@ -252,7 +248,7 @@ static void xfs_dquot_buf_readahead_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; if (!xfs_dquot_buf_verify_crc(mp, bp, true) || xfs_dquot_buf_verify(mp, bp, true) != NULL) { @@ -270,13 +266,15 @@ static void xfs_dquot_buf_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_dquot_buf_verify(mp, bp, false); } const struct xfs_buf_ops xfs_dquot_buf_ops = { .name = "xfs_dquot", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, .verify_struct = xfs_dquot_buf_verify_struct, @@ -284,6 +282,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { .name = "xfs_dquot_ra", + .magic16 = { cpu_to_be16(XFS_DQUOT_MAGIC), + cpu_to_be16(XFS_DQUOT_MAGIC) }, .verify_read = xfs_dquot_buf_readahead_verify, .verify_write = xfs_dquot_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index 66077a105cbb..79e6c4fb1d8a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -54,7 +54,8 @@ #define XFS_ERRTAG_BUF_LRU_REF 31 #define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 #define XFS_ERRTAG_FORCE_SUMMARY_RECALC 33 -#define XFS_ERRTAG_MAX 34 +#define XFS_ERRTAG_IUNLINK_FALLBACK 34 +#define XFS_ERRTAG_MAX 35 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -93,5 +94,6 @@ #define XFS_RANDOM_BUF_LRU_REF 2 #define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 +#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9bb3c48843ec..c968b60cee15 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1071,7 +1071,7 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_INO_MASK(k) (uint32_t)((1ULL << (k)) - 1) #define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog #define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog -#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGINO_BITS(mp) ((mp)->m_ino_geo.agino_log) #define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log #define XFS_INO_BITS(mp) \ XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index f3aa59302fef..52d03a3a02a4 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -97,7 +97,7 @@ struct getbmapx { * For use by backup and restore programs to set the XFS on-disk inode * fields di_dmevmask and di_dmstate. These must be set to exactly and * only values previously obtained via xfs_bulkstat! 
(Specifically the - * xfs_bstat_t fields bs_dmevmask and bs_dmstate.) + * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) */ #ifndef HAVE_FSDMIDATA struct fsdmidata { @@ -124,7 +124,7 @@ typedef struct xfs_flock64 { /* * Output for XFS_IOC_FSGEOMETRY_V1 */ -typedef struct xfs_fsop_geom_v1 { +struct xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ __u32 agblocks; /* fsblocks in an AG */ @@ -145,12 +145,39 @@ typedef struct xfs_fsop_geom_v1 { __u32 logsectsize; /* log sector size, bytes */ __u32 rtsectsize; /* realtime sector size, bytes */ __u32 dirblocksize; /* directory block size, bytes */ -} xfs_fsop_geom_v1_t; +}; + +/* + * Output for XFS_IOC_FSGEOMETRY_V4 + */ +struct xfs_fsop_geom_v4 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ + __u32 logsunit; /* log stripe unit, bytes */ +}; /* * Output for XFS_IOC_FSGEOMETRY */ -typedef struct xfs_fsop_geom { +struct xfs_fsop_geom { __u32 blocksize; /* filesystem (data) block size */ __u32 rtextsize; /* realtime extent size */ __u32 agblocks; /* fsblocks in an AG */ @@ -171,8 +198,18 @@ typedef struct xfs_fsop_geom { __u32 logsectsize; /* log sector size, bytes */ __u32 rtsectsize; /* realtime sector size, bytes */ __u32 dirblocksize; /* directory block size, bytes */ - __u32 logsunit; /* log stripe unit, bytes */ -} xfs_fsop_geom_t; + __u32 logsunit; /* log stripe unit, bytes */ + uint32_t sick; /* o: unhealthy fs & rt metadata */ + uint32_t checked; /* o: checked fs & rt metadata */ + __u64 reserved[17]; /* reserved space */ +}; + +#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ +#define XFS_FSOP_GEOM_SICK_UQUOTA (1 << 1) /* user quota */ +#define XFS_FSOP_GEOM_SICK_GQUOTA (1 << 2) /* group quota */ +#define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */ +#define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */ +#define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { @@ -188,28 +225,30 @@ typedef struct xfs_fsop_resblks { __u64 resblks_avail; } xfs_fsop_resblks_t; -#define XFS_FSOP_GEOM_VERSION 0 - -#define XFS_FSOP_GEOM_FLAGS_ATTR 0x0001 /* attributes in use */ -#define XFS_FSOP_GEOM_FLAGS_NLINK 0x0002 /* 32-bit nlink values */ -#define XFS_FSOP_GEOM_FLAGS_QUOTA 0x0004 /* quotas enabled */ -#define XFS_FSOP_GEOM_FLAGS_IALIGN 0x0008 /* inode alignment */ -#define XFS_FSOP_GEOM_FLAGS_DALIGN 0x0010 /* large data alignment */ -#define XFS_FSOP_GEOM_FLAGS_SHARED 0x0020 /* read-only shared */ -#define XFS_FSOP_GEOM_FLAGS_EXTFLG 0x0040 /* 
special extent flag */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2 0x0080 /* directory version 2 */ -#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ -#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ -#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ -#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ -#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ -#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ -#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ -#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ -#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ -#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ -#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */ -#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */ +#define XFS_FSOP_GEOM_VERSION 0 +#define XFS_FSOP_GEOM_VERSION_V5 5 + +#define XFS_FSOP_GEOM_FLAGS_ATTR (1 << 0) /* attributes in use */ +#define XFS_FSOP_GEOM_FLAGS_NLINK (1 << 1) /* 32-bit nlink values */ +#define XFS_FSOP_GEOM_FLAGS_QUOTA (1 << 2) /* quotas enabled */ +#define XFS_FSOP_GEOM_FLAGS_IALIGN (1 << 3) /* inode alignment */ +#define XFS_FSOP_GEOM_FLAGS_DALIGN (1 << 4) /* large data alignment */ +#define XFS_FSOP_GEOM_FLAGS_SHARED (1 << 5) /* read-only shared */ +#define XFS_FSOP_GEOM_FLAGS_EXTFLG (1 << 6) /* special extent flag */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2 (1 << 7) /* directory version 2 */ +#define XFS_FSOP_GEOM_FLAGS_LOGV2 (1 << 8) /* log format version 2 */ +#define XFS_FSOP_GEOM_FLAGS_SECTOR (1 << 9) /* sector sizes >1BB */ +#define XFS_FSOP_GEOM_FLAGS_ATTR2 (1 << 10) /* inline attributes rework */ +#define XFS_FSOP_GEOM_FLAGS_PROJID32 (1 << 11) /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI (1 << 12) /* ASCII only CI names */ + /* -- Do not use -- (1 << 13) SGI parent pointers */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB (1 << 14) /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB (1 << 15) /* version 5 superblock */ +#define XFS_FSOP_GEOM_FLAGS_FTYPE (1 << 16) /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT (1 << 17) /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES (1 << 18) /* sparse inode chunks */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT (1 << 19) /* reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */ /* * Minimum and maximum sizes need for growth checks. 
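The flags above are surfaced to userspace through the geometry ioctl in struct xfs_fsop_geom. A small sketch of reading them follows; it assumes the uapi definitions are available from the xfsprogs development header <xfs/xfs_fs.h>, whose exact location can vary by distribution.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs_fs.h>	/* struct xfs_fsop_geom, XFS_IOC_FSGEOMETRY */

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo = { 0 };
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <xfs-mount-point>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror("XFS_IOC_FSGEOMETRY");
		return 1;
	}
	printf("agcount=%u blocksize=%u\n", geo.agcount, geo.blocksize);
	printf("reflink: %s\n",
	       (geo.flags & XFS_FSOP_GEOM_FLAGS_REFLINK) ? "yes" : "no");
	printf("finobt: %s\n",
	       (geo.flags & XFS_FSOP_GEOM_FLAGS_FINOBT) ? "yes" : "no");
	close(fd);
	return 0;
}

The shift-style definitions are bit-for-bit identical to the old hex constants; the gap at (1 << 13) preserves the value reserved for SGI parent pointers noted in the list.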
@@ -238,6 +277,31 @@ typedef struct xfs_fsop_resblks { (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) /* + * Output for XFS_IOC_AG_GEOMETRY + */ +struct xfs_ag_geometry { + uint32_t ag_number; /* i/o: AG number */ + uint32_t ag_length; /* o: length in blocks */ + uint32_t ag_freeblks; /* o: free space */ + uint32_t ag_icount; /* o: inodes allocated */ + uint32_t ag_ifree; /* o: inodes free */ + uint32_t ag_sick; /* o: sick things in ag */ + uint32_t ag_checked; /* o: checked metadata in ag */ + uint32_t ag_reserved32; /* o: zero */ + uint64_t ag_reserved[12];/* o: zero */ +}; +#define XFS_AG_GEOM_SICK_SB (1 << 0) /* superblock */ +#define XFS_AG_GEOM_SICK_AGF (1 << 1) /* AGF header */ +#define XFS_AG_GEOM_SICK_AGFL (1 << 2) /* AGFL header */ +#define XFS_AG_GEOM_SICK_AGI (1 << 3) /* AGI header */ +#define XFS_AG_GEOM_SICK_BNOBT (1 << 4) /* free space by block */ +#define XFS_AG_GEOM_SICK_CNTBT (1 << 5) /* free space by length */ +#define XFS_AG_GEOM_SICK_INOBT (1 << 6) /* inode index */ +#define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */ +#define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */ +#define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */ + +/* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT */ typedef struct xfs_growfs_data { @@ -264,7 +328,7 @@ typedef struct xfs_bstime { __s32 tv_nsec; /* and nanoseconds */ } xfs_bstime_t; -typedef struct xfs_bstat { +struct xfs_bstat { __u64 bs_ino; /* inode number */ __u16 bs_mode; /* type and mode */ __u16 bs_nlink; /* number of links */ @@ -285,12 +349,70 @@ typedef struct xfs_bstat { #define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* higher part of project id */ - unsigned char bs_pad[6]; /* pad space, unused */ + uint16_t bs_sick; /* sick inode metadata */ + uint16_t bs_checked; /* checked inode metadata */ + unsigned char bs_pad[2]; /* pad space, unused */ __u32 bs_cowextsize; /* cow extent size */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ -} xfs_bstat_t; +}; + +/* New bulkstat structure that reports v5 features and fixes padding issues */ +struct xfs_bulkstat { + uint64_t bs_ino; /* inode number */ + uint64_t bs_size; /* file size */ + + uint64_t bs_blocks; /* number of blocks */ + uint64_t bs_xflags; /* extended flags */ + + uint64_t bs_atime; /* access time, seconds */ + uint64_t bs_mtime; /* modify time, seconds */ + + uint64_t bs_ctime; /* inode change time, seconds */ + uint64_t bs_btime; /* creation time, seconds */ + + uint32_t bs_gen; /* generation count */ + uint32_t bs_uid; /* user id */ + uint32_t bs_gid; /* group id */ + uint32_t bs_projectid; /* project id */ + + uint32_t bs_atime_nsec; /* access time, nanoseconds */ + uint32_t bs_mtime_nsec; /* modify time, nanoseconds */ + uint32_t bs_ctime_nsec; /* inode change time, nanoseconds */ + uint32_t bs_btime_nsec; /* creation time, nanoseconds */ + + uint32_t bs_blksize; /* block size */ + uint32_t bs_rdev; /* device value */ + uint32_t bs_cowextsize_blks; /* cow extent size hint, blocks */ + uint32_t bs_extsize_blks; /* extent size hint, blocks */ + + uint32_t bs_nlink; /* number of links */ + uint32_t bs_extents; /* number of extents */ + uint32_t bs_aextents; /* attribute number of extents */ + uint16_t bs_version; /* structure version */ + uint16_t bs_forkoff; /* inode fork offset in bytes */ + + uint16_t bs_sick; /* sick inode metadata */ + 
uint16_t bs_checked; /* checked inode metadata */ + uint16_t bs_mode; /* type and mode */ + uint16_t bs_pad2; /* zeroed */ + + uint64_t bs_pad[7]; /* zeroed */ +}; + +#define XFS_BULKSTAT_VERSION_V1 (1) +#define XFS_BULKSTAT_VERSION_V5 (5) + +/* bs_sick flags */ +#define XFS_BS_SICK_INODE (1 << 0) /* inode core */ +#define XFS_BS_SICK_BMBTD (1 << 1) /* data fork */ +#define XFS_BS_SICK_BMBTA (1 << 2) /* attr fork */ +#define XFS_BS_SICK_BMBTC (1 << 3) /* cow fork */ +#define XFS_BS_SICK_DIR (1 << 4) /* directory */ +#define XFS_BS_SICK_XATTR (1 << 5) /* extended attributes */ +#define XFS_BS_SICK_SYMLINK (1 << 6) /* symbolic link remote target */ +#define XFS_BS_SICK_PARENT (1 << 7) /* parent pointers */ /* * Project quota id helpers (previously projid was 16bit only @@ -298,7 +420,7 @@ typedef struct xfs_bstat { * to retain compatibility with "old" filesystems). */ static inline uint32_t -bstat_get_projid(struct xfs_bstat *bs) +bstat_get_projid(const struct xfs_bstat *bs) { return (uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; } @@ -306,23 +428,79 @@ bstat_get_projid(struct xfs_bstat *bs) /* * The user-level BulkStat Request interface structure. */ -typedef struct xfs_fsop_bulkreq { +struct xfs_fsop_bulkreq { __u64 __user *lastip; /* last inode # pointer */ __s32 icount; /* count of entries in buffer */ void __user *ubuffer;/* user buffer for inode desc. */ __s32 __user *ocount; /* output count pointer */ -} xfs_fsop_bulkreq_t; - +}; /* * Structures returned from xfs_inumbers routine (XFS_IOC_FSINUMBERS). */ -typedef struct xfs_inogrp { +struct xfs_inogrp { __u64 xi_startino; /* starting inode number */ __s32 xi_alloccount; /* # bits set in allocmask */ __u64 xi_allocmask; /* mask of allocated inodes */ -} xfs_inogrp_t; +}; + +/* New inumbers structure that reports v5 features and fixes padding issues */ +struct xfs_inumbers { + uint64_t xi_startino; /* starting inode number */ + uint64_t xi_allocmask; /* mask of allocated inodes */ + uint8_t xi_alloccount; /* # bits set in allocmask */ + uint8_t xi_version; /* version */ + uint8_t xi_padding[6]; /* zero */ +}; + +#define XFS_INUMBERS_VERSION_V1 (1) +#define XFS_INUMBERS_VERSION_V5 (5) + +/* Header for bulk inode requests. */ +struct xfs_bulk_ireq { + uint64_t ino; /* I/O: start with this inode */ + uint32_t flags; /* I/O: operation flags */ + uint32_t icount; /* I: count of entries in buffer */ + uint32_t ocount; /* O: count of entries filled out */ + uint32_t agno; /* I: see comment for IREQ_AGNO */ + uint64_t reserved[5]; /* must be zero */ +}; + +/* + * Only return results from the specified @agno. If @ino is zero, start + * with the first inode of @agno. + */ +#define XFS_BULK_IREQ_AGNO (1 << 0) + +/* + * Return bulkstat information for a single inode, where @ino value is a + * special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_* + * values below. Not compatible with XFS_BULK_IREQ_AGNO. + */ +#define XFS_BULK_IREQ_SPECIAL (1 << 1) + +#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \ + XFS_BULK_IREQ_SPECIAL) + +/* Operate on the root directory inode. 
*/ +#define XFS_BULK_IREQ_SPECIAL_ROOT (1) + +/* + * ioctl structures for v5 bulkstat and inumbers requests + */ +struct xfs_bulkstat_req { + struct xfs_bulk_ireq hdr; + struct xfs_bulkstat bulkstat[]; +}; +#define XFS_BULKSTAT_REQ_SIZE(nr) (sizeof(struct xfs_bulkstat_req) + \ + (nr) * sizeof(struct xfs_bulkstat)) +struct xfs_inumbers_req { + struct xfs_bulk_ireq hdr; + struct xfs_inumbers inumbers[]; +}; +#define XFS_INUMBERS_REQ_SIZE(nr) (sizeof(struct xfs_inumbers_req) + \ + (nr) * sizeof(struct xfs_inumbers)) /* * Error injection. @@ -453,7 +631,7 @@ typedef struct xfs_swapext xfs_off_t sx_offset; /* offset into file */ xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ - xfs_bstat_t sx_stat; /* stat of target b4 copy */ + struct xfs_bstat sx_stat; /* stat of target b4 copy */ } xfs_swapext_t; /* @@ -502,9 +680,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_UQUOTA 21 /* user quotas */ #define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ #define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ +#define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 24 +#define XFS_SCRUB_TYPE_NR 25 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1 << 0) @@ -590,6 +769,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* XFS_IOC_GETFSMAP ------ hoisted 59 */ #define XFS_IOC_SCRUB_METADATA _IOWR('X', 60, struct xfs_scrub_metadata) +#define XFS_IOC_AG_GEOMETRY _IOWR('X', 61, struct xfs_ag_geometry) /* * ioctl commands that replace IRIX syssgi()'s @@ -620,8 +800,11 @@ struct xfs_scrub_metadata { #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) -#define XFS_IOC_FSGEOMETRY _IOR ('X', 124, struct xfs_fsop_geom) +#define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) #define XFS_IOC_GOINGDOWN _IOR ('X', 125, uint32_t) +#define XFS_IOC_FSGEOMETRY _IOR ('X', 126, struct xfs_fsop_geom) +#define XFS_IOC_BULKSTAT _IOR ('X', 127, struct xfs_bulkstat_req) +#define XFS_IOC_INUMBERS _IOR ('X', 128, struct xfs_inumbers_req) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h new file mode 100644 index 000000000000..272005ac8c88 --- /dev/null +++ b/fs/xfs/libxfs/xfs_health.h @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_HEALTH_H__ +#define __XFS_HEALTH_H__ + +/* + * In-Core Filesystem Health Assessments + * ===================================== + * + * We'd like to be able to summarize the current health status of the + * filesystem so that the administrator knows when it's necessary to schedule + * some downtime for repairs. Until then, we would also like to avoid abrupt + * shutdowns due to corrupt metadata. + * + * The online scrub feature evaluates the health of all filesystem metadata. + * When scrub detects corruption in a piece of metadata it will set the + * corresponding sickness flag, and repair will clear it if successful. If + * problems remain at unmount time, we can also request manual intervention by + * logging a notice to run xfs_repair. 
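Tying the v5 bulk request pieces above together: a caller sizes one allocation with XFS_BULKSTAT_REQ_SIZE(), sets hdr.icount, and reissues XFS_IOC_BULKSTAT until fewer than icount records come back. Per the "I/O" annotation on hdr.ino in the struct comment, the kernel writes the continuation cursor back into the header, so each reissue resumes the walk. A hedged userspace sketch, again assuming the uapi definitions via <xfs/xfs_fs.h> and a kernel that implements the v5 interface:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs_fs.h>	/* struct xfs_bulkstat_req, XFS_IOC_BULKSTAT */

#define NR_BATCH 64

int main(int argc, char **argv)
{
	struct xfs_bulkstat_req *req;
	uint32_t i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* One allocation covers the header plus NR_BATCH result records. */
	req = calloc(1, XFS_BULKSTAT_REQ_SIZE(NR_BATCH));
	if (!req)
		return 1;
	req->hdr.icount = NR_BATCH;	/* hdr.ino = 0: start at the beginning */

	for (;;) {
		if (ioctl(fd, XFS_IOC_BULKSTAT, req) < 0) {
			perror("XFS_IOC_BULKSTAT");
			break;
		}
		for (i = 0; i < req->hdr.ocount; i++)
			printf("ino %llu size %llu\n",
			       (unsigned long long)req->bulkstat[i].bs_ino,
			       (unsigned long long)req->bulkstat[i].bs_size);
		if (req->hdr.ocount < NR_BATCH)
			break;	/* short return: no more inodes */
	}
	free(req);
	close(fd);
	return 0;
}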
+ * + * Each health tracking group uses a pair of fields for reporting. The + * "checked" field tells us if a given piece of metadata has ever been examined, + * and the "sick" field tells us if that piece was found to need repairs. + * Therefore we can conclude that for a given sick flag value: + * + * - checked && sick => metadata needs repair + * - checked && !sick => metadata is ok + * - !checked => has not been examined since mount + */ + +struct xfs_mount; +struct xfs_perag; +struct xfs_inode; +struct xfs_fsop_geom; + +/* Observable health issues for metadata spanning the entire filesystem. */ +#define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ +#define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */ +#define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */ +#define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ + +/* Observable health issues for realtime volume metadata. */ +#define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ +#define XFS_SICK_RT_SUMMARY (1 << 1) /* realtime summary */ + +/* Observable health issues for AG metadata. */ +#define XFS_SICK_AG_SB (1 << 0) /* superblock */ +#define XFS_SICK_AG_AGF (1 << 1) /* AGF header */ +#define XFS_SICK_AG_AGFL (1 << 2) /* AGFL header */ +#define XFS_SICK_AG_AGI (1 << 3) /* AGI header */ +#define XFS_SICK_AG_BNOBT (1 << 4) /* free space by block */ +#define XFS_SICK_AG_CNTBT (1 << 5) /* free space by length */ +#define XFS_SICK_AG_INOBT (1 << 6) /* inode index */ +#define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */ +#define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */ +#define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */ + +/* Observable health issues for inode metadata. */ +#define XFS_SICK_INO_CORE (1 << 0) /* inode core */ +#define XFS_SICK_INO_BMBTD (1 << 1) /* data fork */ +#define XFS_SICK_INO_BMBTA (1 << 2) /* attr fork */ +#define XFS_SICK_INO_BMBTC (1 << 3) /* cow fork */ +#define XFS_SICK_INO_DIR (1 << 4) /* directory */ +#define XFS_SICK_INO_XATTR (1 << 5) /* extended attributes */ +#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ +#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ + +/* Primary evidence of health problems in a given group. */ +#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ + XFS_SICK_FS_UQUOTA | \ + XFS_SICK_FS_GQUOTA | \ + XFS_SICK_FS_PQUOTA) + +#define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ + XFS_SICK_RT_SUMMARY) + +#define XFS_SICK_AG_PRIMARY (XFS_SICK_AG_SB | \ + XFS_SICK_AG_AGF | \ + XFS_SICK_AG_AGFL | \ + XFS_SICK_AG_AGI | \ + XFS_SICK_AG_BNOBT | \ + XFS_SICK_AG_CNTBT | \ + XFS_SICK_AG_INOBT | \ + XFS_SICK_AG_FINOBT | \ + XFS_SICK_AG_RMAPBT | \ + XFS_SICK_AG_REFCNTBT) + +#define XFS_SICK_INO_PRIMARY (XFS_SICK_INO_CORE | \ + XFS_SICK_INO_BMBTD | \ + XFS_SICK_INO_BMBTA | \ + XFS_SICK_INO_BMBTC | \ + XFS_SICK_INO_DIR | \ + XFS_SICK_INO_XATTR | \ + XFS_SICK_INO_SYMLINK | \ + XFS_SICK_INO_PARENT) + +/* These functions must be provided by the xfs implementation.
*/ + +void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, + unsigned int *checked); + +void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); +void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, + unsigned int *checked); + +void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); +void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); +void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, + unsigned int *checked); + +void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, + unsigned int *checked); + +void xfs_health_unmount(struct xfs_mount *mp); + +/* Now some helpers. */ + +static inline bool +xfs_fs_has_sickness(struct xfs_mount *mp, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_fs_measure_sickness(mp, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_rt_has_sickness(struct xfs_mount *mp, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_rt_measure_sickness(mp, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_ag_has_sickness(struct xfs_perag *pag, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_ag_measure_sickness(pag, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_inode_has_sickness(struct xfs_inode *ip, unsigned int mask) +{ + unsigned int sick, checked; + + xfs_inode_measure_sickness(ip, &sick, &checked); + return sick & mask; +} + +static inline bool +xfs_fs_is_healthy(struct xfs_mount *mp) +{ + return !xfs_fs_has_sickness(mp, -1U); +} + +static inline bool +xfs_rt_is_healthy(struct xfs_mount *mp) +{ + return !xfs_rt_has_sickness(mp, -1U); +} + +static inline bool +xfs_ag_is_healthy(struct xfs_perag *pag) +{ + return !xfs_ag_has_sickness(pag, -1U); +} + +static inline bool +xfs_inode_is_healthy(struct xfs_inode *ip) +{ + return !xfs_inode_has_sickness(ip, -1U); +} + +void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); +void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); +void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); + +#endif /* __XFS_HEALTH_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d32152fc8a6c..04377ab75863 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -12,17 +12,14 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" -#include "xfs_rtalloc.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_bmap.h" -#include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_icreate_item.h" @@ -31,20 +28,6 @@ #include "xfs_log.h" #include "xfs_rmap.h" - -/* - * Allocation group level functions. - */ -int -xfs_ialloc_cluster_alignment( - struct xfs_mount *mp) -{ - if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) - return mp->m_sb.sb_inoalignmt; - return 1; -} - /* * Lookup a record by ino in the btree given by cur. 
*/ @@ -299,7 +282,7 @@ xfs_ialloc_inode_init( * sizes, manipulate the inodes in buffers which are multiples of the * blocks size. */ - nbufs = length / mp->m_blocks_per_cluster; + nbufs = length / M_IGEO(mp)->blocks_per_cluster; /* * Figure out what version number to use in the inodes we create. If @@ -343,9 +326,10 @@ xfs_ialloc_inode_init( * Get the block. */ d = XFS_AGB_TO_DADDR(mp, agno, agbno + - (j * mp->m_blocks_per_cluster)); + (j * M_IGEO(mp)->blocks_per_cluster)); fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize * mp->m_blocks_per_cluster, + mp->m_bsize * + M_IGEO(mp)->blocks_per_cluster, XBF_UNMAPPED); if (!fbuf) return -ENOMEM; @@ -353,7 +337,7 @@ xfs_ialloc_inode_init( /* Initialize the inode buffers and log them appropriately. */ fbuf->b_ops = &xfs_inode_buf_ops; xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); - for (i = 0; i < mp->m_inodes_per_cluster; i++) { + for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; uint isize = xfs_dinode_size(version); @@ -616,24 +600,26 @@ error: * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ -STATIC int /* error code or 0 */ +STATIC int xfs_ialloc_ag_alloc( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* alloc group buffer */ - int *alloc) + struct xfs_trans *tp, + struct xfs_buf *agbp, + int *alloc) { - xfs_agi_t *agi; /* allocation group header */ - xfs_alloc_arg_t args; /* allocation argument structure */ - xfs_agnumber_t agno; - int error; - xfs_agino_t newino; /* new first inode's number */ - xfs_agino_t newlen; /* new number of inodes */ - int isaligned = 0; /* inode allocation at stripe unit */ - /* boundary */ - uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */ + struct xfs_agi *agi; + struct xfs_alloc_arg args; + xfs_agnumber_t agno; + int error; + xfs_agino_t newino; /* new first inode's number */ + xfs_agino_t newlen; /* new number of inodes */ + int isaligned = 0; /* inode allocation at stripe */ + /* unit boundary */ + /* init. to full chunk */ + uint16_t allocmask = (uint16_t) -1; struct xfs_inobt_rec_incore rec; - struct xfs_perag *pag; - int do_sparse = 0; + struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(tp->t_mountp); + int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; @@ -644,7 +630,7 @@ xfs_ialloc_ag_alloc( #ifdef DEBUG /* randomly do sparse inode allocations */ if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && - args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks) + igeo->ialloc_min_blks < igeo->ialloc_blks) do_sparse = prandom_u32() & 1; #endif @@ -652,12 +638,12 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = args.mp->m_ialloc_inos; - if (args.mp->m_maxicount && + newlen = igeo->ialloc_inos; + if (igeo->maxicount && percpu_counter_read_positive(&args.mp->m_icount) + newlen > - args.mp->m_maxicount) + igeo->maxicount) return -ENOSPC; - args.minlen = args.maxlen = args.mp->m_ialloc_blks; + args.minlen = args.maxlen = igeo->ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. 
If the filesystem is striped, this will fill @@ -667,7 +653,7 @@ xfs_ialloc_ag_alloc( newino = be32_to_cpu(agi->agi_newino); agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - args.mp->m_ialloc_blks; + igeo->ialloc_blks; if (do_sparse) goto sparse_alloc; if (likely(newino != NULLAGINO && @@ -690,10 +676,10 @@ xfs_ialloc_ag_alloc( * but not to use them in the actual exact allocation. */ args.alignment = 1; - args.minalignslop = args.mp->m_cluster_align - 1; + args.minalignslop = igeo->cluster_align - 1; /* Allow space for the inode btree to split. */ - args.minleft = args.mp->m_in_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; @@ -720,12 +706,12 @@ xfs_ialloc_ag_alloc( * pieces, so don't need alignment anyway. */ isaligned = 0; - if (args.mp->m_sinoalign) { + if (igeo->ialloc_align) { ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); args.alignment = args.mp->m_dalign; isaligned = 1; } else - args.alignment = args.mp->m_cluster_align; + args.alignment = igeo->cluster_align; /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -741,7 +727,7 @@ xfs_ialloc_ag_alloc( /* * Allow space for the inode btree to split. */ - args.minleft = args.mp->m_in_maxlevels - 1; + args.minleft = igeo->inobt_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -754,7 +740,7 @@ xfs_ialloc_ag_alloc( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); - args.alignment = args.mp->m_cluster_align; + args.alignment = igeo->cluster_align; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -764,7 +750,7 @@ xfs_ialloc_ag_alloc( * the sparse allocation length is smaller than a full chunk. */ if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && - args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -773,7 +759,7 @@ sparse_alloc: args.alignment = args.mp->m_sb.sb_spino_align; args.prod = 1; - args.minlen = args.mp->m_ialloc_min_blks; + args.minlen = igeo->ialloc_min_blks; args.maxlen = args.minlen; /* @@ -789,7 +775,7 @@ sparse_alloc: args.min_agbno = args.mp->m_sb.sb_inoalignmt; args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, args.mp->m_sb.sb_inoalignmt) - - args.mp->m_ialloc_blks; + igeo->ialloc_blks; error = xfs_alloc_vextent(&args); if (error) @@ -1006,7 +992,7 @@ xfs_ialloc_ag_select( * space needed for alignment of inode chunks when checking the * longest contiguous free space in the AG - this prevents us * from getting ENOSPC because we have free space larger than - * m_ialloc_blks but alignment constraints prevent us from using + * ialloc_blks but alignment constraints prevent us from using * it. * * If we can't find an AG with space for full alignment slack to @@ -1015,9 +1001,9 @@ xfs_ialloc_ag_select( * if we fail allocation due to alignment issues then it is most * likely a real ENOSPC condition. 
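 *
 * (Editorial worked example with made-up geometry: if ialloc_min_blks is 4
 * and cluster_align is 4, the aligned pass requires longest >= 4 + 4 = 8
 * blocks of contiguous free space, while the retry without alignment slack
 * accepts longest >= 4.)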
*/ - ineed = mp->m_ialloc_min_blks; + ineed = M_IGEO(mp)->ialloc_min_blks; if (flags && ineed > 1) - ineed += mp->m_cluster_align; + ineed += M_IGEO(mp)->cluster_align; longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -1703,6 +1689,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(mp); int okalloc = 1; if (*IO_agbp) { @@ -1733,9 +1720,9 @@ xfs_dialloc( * Read rough value of mp->m_icount by percpu_counter_read_positive, * which will sacrifice the preciseness but improve the performance. */ - if (mp->m_maxicount && - percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos - > mp->m_maxicount) { + if (igeo->maxicount && + percpu_counter_read_positive(&mp->m_icount) + igeo->ialloc_inos + > igeo->maxicount) { noroom = 1; okalloc = 0; } @@ -1852,7 +1839,8 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), - mp->m_ialloc_blks, &XFS_RMAP_OINFO_INODES); + M_IGEO(mp)->ialloc_blks, + &XFS_RMAP_OINFO_INODES); return; } @@ -2261,7 +2249,7 @@ xfs_imap_lookup( /* check that the returned record contains the required inode */ if (rec.ir_startino > agino || - rec.ir_startino + mp->m_ialloc_inos <= agino) + rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino) return -EINVAL; /* for untrusted inodes check it is allocated first */ @@ -2352,7 +2340,7 @@ xfs_imap( * If the inode cluster size is the same as the blocksize or * smaller we get to the buffer by simple arithmetics. */ - if (mp->m_blocks_per_cluster == 1) { + if (M_IGEO(mp)->blocks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); @@ -2368,8 +2356,8 @@ xfs_imap( * find the location. Otherwise we have to do a btree * lookup to find the location. */ - if (mp->m_inoalign_mask) { - offset_agbno = agbno & mp->m_inoalign_mask; + if (M_IGEO(mp)->inoalign_mask) { + offset_agbno = agbno & M_IGEO(mp)->inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { error = xfs_imap_lookup(mp, tp, agno, agino, agbno, @@ -2381,13 +2369,13 @@ xfs_imap( out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + - ((offset_agbno / mp->m_blocks_per_cluster) * - mp->m_blocks_per_cluster); + ((offset_agbno / M_IGEO(mp)->blocks_per_cluster) * + M_IGEO(mp)->blocks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); - imap->im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); + imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* @@ -2409,20 +2397,6 @@ out_map: } /* - * Compute and fill in value of m_in_maxlevels. - */ -void -xfs_ialloc_compute_maxlevels( - xfs_mount_t *mp) /* file system mount structure */ -{ - uint inodes; - - inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; - mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr, - inodes); -} - -/* * Log specified fields for the ag hdr (inode section). The growth of the agi * structure over time requires that we interpret the buffer as two logical * regions delineated by the end of the unlinked list. 
This is due to the size @@ -2493,7 +2467,7 @@ static xfs_failaddr_t xfs_agi_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); int i; @@ -2508,7 +2482,7 @@ xfs_agi_verify( /* * Validate the magic number of the agi block. */ - if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + if (!xfs_verify_magic(bp, agi->agi_magicnum)) return __this_address; if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return __this_address; @@ -2545,7 +2519,7 @@ static void xfs_agi_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; if (xfs_sb_version_hascrc(&mp->m_sb) && @@ -2562,7 +2536,7 @@ static void xfs_agi_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; xfs_failaddr_t fa; @@ -2582,6 +2556,7 @@ xfs_agi_write_verify( const struct xfs_buf_ops xfs_agi_buf_ops = { .name = "xfs_agi", + .magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) }, .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, .verify_struct = xfs_agi_verify, @@ -2767,3 +2742,110 @@ xfs_ialloc_count_inodes( *freecount = ci.freecount; return 0; } + +/* + * Initialize inode-related geometry information. + * + * Compute the inode btree min and max levels and set maxicount. + * + * Set the inode cluster size. This may still be overridden by the file + * system block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to keep a + * constant ratio of inode per cluster buffer, but only if mkfs has set the + * inode alignment value appropriately for larger cluster sizes. + * + * Then compute the inode cluster alignment information. + */ +void +xfs_ialloc_setup_geometry( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + uint64_t icount; + uint inodes; + + /* Compute inode btree geometry. */ + igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2; + igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2; + + igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog; + + if (sbp->sb_spino_align) + igeo->ialloc_min_blks = sbp->sb_spino_align; + else + igeo->ialloc_min_blks = igeo->ialloc_blks; + + /* Compute and fill in value of m_ino_geo.inobt_maxlevels. */ + inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; + igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, + inodes); + + /* Set the maximum inode count for this filesystem. */ + if (sbp->sb_imax_pct) { + /* + * Make sure the maximum inode count is a multiple + * of the units we allocate inodes in. + */ + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + do_div(icount, igeo->ialloc_blks); + igeo->maxicount = XFS_FSB_TO_INO(mp, + icount * igeo->ialloc_blks); + } else { + igeo->maxicount = 0; + } + + /* + * Compute the desired inode cluster buffer size, which starts at + * 8K and (on v5 filesystems) scales up with larger inode + * sizes.
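+ *
+ * (Editorial worked example: the scaling below multiplies the 8192-byte
+ * base by sb_inodesize / 256, so a v5 filesystem with 512-byte inodes
+ * computes new_size = 8192 * (512 / 256) = 16384 bytes, kept only if
+ * sb_inoalignmt covers that many blocks.)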
+ * + * Preserve the desired inode cluster size because the sparse inodes + * feature uses that desired size (not the actual size) to compute the + * sparse inode alignment. The mount code validates this value, so we + * cannot change the behavior. + */ + igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = igeo->inode_cluster_size_raw; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + igeo->inode_cluster_size_raw = new_size; + } + + /* Calculate inode cluster ratios. */ + if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize) + igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp, + igeo->inode_cluster_size_raw); + else + igeo->blocks_per_cluster = 1; + igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster); + igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster); + + /* Calculate inode cluster alignment. */ + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster) + igeo->cluster_align = mp->m_sb.sb_inoalignmt; + else + igeo->cluster_align = 1; + igeo->inoalign_mask = igeo->cluster_align - 1; + igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align); + + /* + * If we are using stripe alignment, check whether + * the stripe unit is a multiple of the inode alignment + */ + if (mp->m_dalign && igeo->inoalign_mask && + !(mp->m_dalign & igeo->inoalign_mask)) + igeo->ialloc_align = mp->m_dalign; + else + igeo->ialloc_align = 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index e936b7cc9389..323592d563d5 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -23,16 +23,6 @@ struct xfs_icluster { * sparse chunks */ }; -/* Calculate and return the number of filesystem blocks per inode cluster */ -static inline int -xfs_icluster_size_fsb( - struct xfs_mount *mp) -{ - if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) - return 1; - return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; -} - /* * Make an inode pointer out of the buffer/offset. */ @@ -96,13 +86,6 @@ xfs_imap( uint flags); /* flags for inode btree lookup */ /* - * Compute and fill in value of m_in_maxlevels. 
- */ -void -xfs_ialloc_compute_maxlevels( - struct xfs_mount *mp); /* file system mount structure */ - -/* * Log specified fields for the ag hdr (inode section) */ void @@ -168,5 +151,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int *stat); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); +void xfs_ialloc_setup_geometry(struct xfs_mount *mp); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9b25e7a0df47..b82992f795aa 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -11,14 +11,12 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_rmap.h" @@ -28,7 +26,7 @@ xfs_inobt_get_minrecs( struct xfs_btree_cur *cur, int level) { - return cur->bc_mp->m_inobt_mnr[level != 0]; + return M_IGEO(cur->bc_mp)->inobt_mnr[level != 0]; } STATIC struct xfs_btree_cur * @@ -124,7 +122,7 @@ xfs_finobt_alloc_block( union xfs_btree_ptr *new, int *stat) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_METADATA); @@ -154,7 +152,7 @@ xfs_finobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - if (cur->bc_mp->m_inotbt_nores) + if (cur->bc_mp->m_finobt_nores) return xfs_inobt_free_block(cur, bp); return __xfs_inobt_free_block(cur, bp, XFS_AG_RESV_METADATA); } @@ -164,7 +162,7 @@ xfs_inobt_get_maxrecs( struct xfs_btree_cur *cur, int level) { - return cur->bc_mp->m_inobt_mxr[level != 0]; + return M_IGEO(cur->bc_mp)->inobt_mxr[level != 0]; } STATIC void @@ -255,11 +253,14 @@ static xfs_failaddr_t xfs_inobt_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); xfs_failaddr_t fa; unsigned int level; + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + /* * During growfs operations, we can't verify the exact owner as the * perag is not fully initialised and hence not attached to the buffer. @@ -270,26 +271,19 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. 
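 *
 * (Editorial note: xfs_verify_magic() is assumed to pick the expected magic
 * number by the superblock CRC feature bit, roughly:)
 *
 *	int idx = xfs_sb_version_hascrc(&mp->m_sb);
 *
 *	return dmagic == bp->b_ops->magic[idx];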
*/ - switch (block->bb_magic) { - case cpu_to_be32(XFS_IBT_CRC_MAGIC): - case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (xfs_sb_version_hascrc(&mp->m_sb)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; - /* fall through */ - case cpu_to_be32(XFS_IBT_MAGIC): - case cpu_to_be32(XFS_FIBT_MAGIC): - break; - default: - return __this_address; } /* level verification */ level = be16_to_cpu(block->bb_level); - if (level >= mp->m_in_maxlevels) + if (level >= M_IGEO(mp)->inobt_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); + return xfs_btree_sblock_verify(bp, + M_IGEO(mp)->inobt_mxr[level != 0]); } static void @@ -328,6 +322,16 @@ xfs_inobt_write_verify( const struct xfs_buf_ops xfs_inobt_buf_ops = { .name = "xfs_inobt", + .magic = { cpu_to_be32(XFS_IBT_MAGIC), cpu_to_be32(XFS_IBT_CRC_MAGIC) }, + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, + .verify_struct = xfs_inobt_verify, +}; + +const struct xfs_buf_ops xfs_finobt_buf_ops = { + .name = "xfs_finobt", + .magic = { cpu_to_be32(XFS_FIBT_MAGIC), + cpu_to_be32(XFS_FIBT_CRC_MAGIC) }, .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, .verify_struct = xfs_inobt_verify, @@ -389,7 +393,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, .key_diff = xfs_inobt_key_diff, - .buf_ops = &xfs_inobt_buf_ops, + .buf_ops = &xfs_finobt_buf_ops, .diff_two_keys = xfs_inobt_diff_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, @@ -541,14 +545,53 @@ xfs_inobt_max_size( xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); /* Bail out if we're uninitialized, which can happen in mkfs. */ - if (mp->m_inobt_mxr[0] == 0) + if (M_IGEO(mp)->inobt_mxr[0] == 0) return 0; - return xfs_btree_calc_size(mp->m_inobt_mnr, + /* + * The log is permanently allocated, so the space it occupies will + * never be available for the kinds of things that would require btree + * expansion. We therefore can pretend the space isn't there. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + agblocks -= mp->m_sb.sb_logblocks; + + return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, (uint64_t)agblocks * mp->m_sb.sb_inopblock / XFS_INODES_PER_CHUNK); } +/* Read AGI and create inobt cursor. 
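+ *
+ * (Editorial sketch of a typical caller, mirroring the teardown in
+ * xfs_inobt_count_blocks below; illustrative only:)
+ *
+ *	struct xfs_buf *agi_bp = NULL;
+ *	struct xfs_btree_cur *cur = NULL;
+ *
+ *	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, &cur, &agi_bp);
+ *	if (error)
+ *		return error;
+ *	...use the cursor...
+ *	xfs_btree_del_cursor(cur, error);
+ *	xfs_trans_brelse(tp, agi_bp);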
*/ +int +xfs_inobt_cur( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_btnum_t which, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(*agi_bpp == NULL); + ASSERT(*curpp == NULL); + + error = xfs_ialloc_read_agi(mp, tp, agno, agi_bpp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, tp, *agi_bpp, agno, which); + if (!cur) { + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + return -ENOMEM; + } + *curpp = cur; + return 0; +} + static int xfs_inobt_count_blocks( struct xfs_mount *mp, @@ -557,15 +600,14 @@ xfs_inobt_count_blocks( xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { - struct xfs_buf *agbp; - struct xfs_btree_cur *cur; + struct xfs_buf *agbp = NULL; + struct xfs_btree_cur *cur = NULL; int error; - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + error = xfs_inobt_cur(mp, tp, agno, btnum, &cur, &agbp); if (error) return error; - cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); @@ -605,5 +647,5 @@ xfs_iallocbt_calc_size( struct xfs_mount *mp, unsigned long long len) { - return xfs_btree_calc_size(mp->m_inobt_mnr, len); + return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len); } diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index ebdd0c6b8766..951305ecaae1 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -64,5 +64,8 @@ int xfs_finobt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); +int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, xfs_btnum_t btnum, + struct xfs_btree_cur **curpp, struct xfs_buf **agi_bpp); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 771dd072015d..27aa3f2bc4bc 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -3,18 +3,14 @@ * Copyright (c) 2017 Christoph Hellwig. */ -#include <linux/cache.h> -#include <linux/kernel.h> -#include <linux/slab.h> #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_inode.h" -#include "xfs_inode_fork.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_bmap.h" #include "xfs_trace.h" /* @@ -614,16 +610,15 @@ xfs_iext_realloc_root( } /* - * Increment the sequence counter if we are on a COW fork. This allows - * the writeback code to skip looking for a COW extent if the COW fork - * hasn't changed. We use WRITE_ONCE here to ensure the update to the - * sequence counter is seen before the modifications to the extent - * tree itself take effect. + * Increment the sequence counter on extent tree changes. If we are on a COW + * fork, this allows the writeback code to skip looking for a COW extent if the + * COW fork hasn't changed. We use WRITE_ONCE here to ensure the update to the + * sequence counter is seen before the modifications to the extent tree itself + * take effect. 
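+ *
+ * (Editorial sketch of the reader side, illustrative only:)
+ *
+ *	unsigned int seq = READ_ONCE(ifp->if_seq);
+ *
+ *	...walk the extent tree...
+ *
+ *	if (READ_ONCE(ifp->if_seq) != seq)
+ *		...the fork changed underneath us; revalidate...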
*/ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp, int state) { - if (state & BMAP_COWFORK) - WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); + WRITE_ONCE(ifp->if_seq, READ_ONCE(ifp->if_seq) + 1); } void diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 09d9c8cfa4a0..28ab3c5255e1 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -10,11 +10,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_errortag.h" #include "xfs_error.h" -#include "xfs_cksum.h" #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" @@ -33,12 +31,9 @@ xfs_inobp_check( xfs_buf_t *bp) { int i; - int j; xfs_dinode_t *dip; - j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - - for (i = 0; i < j; i++) { + for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, @@ -80,7 +75,7 @@ xfs_inode_buf_verify( struct xfs_buf *bp, bool readahead) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_agnumber_t agno; int i; int ni; @@ -97,10 +92,9 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); - di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - (unlinked_ino == NULLAGINO || - xfs_verify_agino(mp, agno, unlinked_ino)); + xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -147,12 +141,16 @@ xfs_inode_buf_write_verify( const struct xfs_buf_ops xfs_inode_buf_ops = { .name = "xfs_inode", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { - .name = "xxfs_inode_ra", + .name = "xfs_inode_ra", + .magic16 = { cpu_to_be16(XFS_DINODE_MAGIC), + cpu_to_be16(XFS_DINODE_MAGIC) }, .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index f9acf1d436f6..bf3e04018246 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -3,10 +3,10 @@ * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. 
*/ -#include <linux/log2.h> #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -19,12 +19,10 @@ #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_attr_sf.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_dir2_priv.h" #include "xfs_attr_leaf.h" -#include "xfs_shared.h" kmem_zone_t *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 60361d2d74a1..00c62ce170d0 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -14,7 +14,7 @@ struct xfs_dinode; */ struct xfs_ifork { int if_bytes; /* bytes in if_u1 */ - unsigned int if_seq; /* cow fork mod counter */ + unsigned int if_seq; /* fork mod counter */ struct xfs_btree_block *if_broot; /* file's incore btree root */ short if_broot_bytes; /* bytes allocated for root */ unsigned char if_flags; /* per-fork flags */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 1b542ec11d5d..7f55eb3f3653 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -12,9 +12,7 @@ #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_trans_space.h" -#include "xfs_inode.h" #include "xfs_da_btree.h" -#include "xfs_attr_leaf.h" #include "xfs_bmap_btree.h" /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 4bfdd5f4c6af..b2113b17e53c 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -142,7 +142,7 @@ extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); -extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, +extern void xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 542aa1475b5f..51bb9bdb0e84 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -9,7 +9,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_btree.h" @@ -19,7 +18,6 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_refcount.h" diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d9eab657b63e..38529dbacd55 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -12,12 +12,10 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_bmap.h" #include "xfs_refcount_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_bit.h" #include "xfs_rmap.h" @@ -203,13 +201,13 @@ STATIC xfs_failaddr_t xfs_refcountbt_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; - if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if 
(!xfs_sb_version_hasreflink(&mp->m_sb)) @@ -264,6 +262,7 @@ xfs_refcountbt_write_verify( const struct xfs_buf_ops xfs_refcountbt_buf_ops = { .name = "xfs_refcountbt", + .magic = { 0, cpu_to_be32(XFS_REFC_CRC_MAGIC) }, .verify_read = xfs_refcountbt_read_verify, .verify_write = xfs_refcountbt_write_verify, .verify_struct = xfs_refcountbt_verify, @@ -426,6 +425,15 @@ xfs_refcountbt_calc_reserves( tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_trans_brelse(tp, agbp); + /* + * The log is permanently allocated, so the space it occupies will + * never be available for the kinds of things that would require btree + * expansion. We therefore can pretend the space isn't there. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + agblocks -= mp->m_sb.sb_logblocks; + *ask += xfs_refcountbt_max_size(mp, agblocks); *used += tree_len; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 8ed885507dd8..e6aeb390b2fb 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -10,24 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_btree.h" #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" -#include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_errortag.h" #include "xfs_error.h" -#include "xfs_extent_busy.h" -#include "xfs_bmap.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" /* * Lookup the first record less than or equal to [bno, len, owner, offset] diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f79cf040d745..fc78efa52c94 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -9,18 +9,14 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_error.h" #include "xfs_extent_busy.h" #include "xfs_ag_resv.h" @@ -292,7 +288,7 @@ static xfs_failaddr_t xfs_rmapbt_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; @@ -310,7 +306,7 @@ xfs_rmapbt_verify( * from the on disk AGF. Again, we can only check against maximum limits * in this case. */ - if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) + if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) @@ -365,6 +361,7 @@ xfs_rmapbt_write_verify( const struct xfs_buf_ops xfs_rmapbt_buf_ops = { .name = "xfs_rmapbt", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, .verify_read = xfs_rmapbt_read_verify, .verify_write = xfs_rmapbt_write_verify, .verify_struct = xfs_rmapbt_verify, @@ -577,6 +574,15 @@ xfs_rmapbt_calc_reserves( tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_trans_brelse(tp, agbp); + /* + * The log is permanently allocated, so the space it occupies will + * never be available for the kinds of things that would require btree + * expansion. We therefore can pretend the space isn't there. 
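+ *
+ * (Editorial worked example with made-up numbers: a 10 MiB internal log on
+ * a filesystem with 4 KiB blocks has sb_logblocks = 2560, so the AG holding
+ * the log computes its reservation against agblocks - 2560.)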
+ */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + agblocks -= mp->m_sb.sb_logblocks; + /* Reserve 1% of the AG or enough for 1 block per record. */ *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); *used += tree_len; diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index eaaff67e9626..8ea1efc97b41 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -13,15 +13,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc.h" -#include "xfs_error.h" #include "xfs_trans.h" -#include "xfs_trans_space.h" -#include "xfs_trace.h" -#include "xfs_buf.h" -#include "xfs_icache.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b5a82acd7dfe..a08dd8f40346 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -10,26 +10,20 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" -#include "xfs_bmap.h" #include "xfs_refcount_btree.h" #include "xfs_da_format.h" -#include "xfs_da_btree.h" +#include "xfs_health.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -225,10 +219,11 @@ xfs_validate_sb_common( struct xfs_buf *bp, struct xfs_sb *sbp) { + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); uint32_t agcount = 0; uint32_t rem; - if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; } @@ -684,7 +679,7 @@ xfs_sb_read_verify( struct xfs_buf *bp) { struct xfs_sb sb; - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); int error; @@ -750,7 +745,7 @@ xfs_sb_write_verify( struct xfs_buf *bp) { struct xfs_sb sb; - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; int error; @@ -781,12 +776,14 @@ out_error: const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", + .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; @@ -796,12 +793,14 @@ const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { * * Mount initialization code establishing various mount * fields from the superblock associated with the given - * mount structure + * mount structure. + * + * Inode geometry is calculated in xfs_ialloc_setup_geometry.
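+ *
+ * (Editorial sketch of the expected mount-time ordering, assuming the
+ * xfs_mountfs() caller; illustrative only:)
+ *
+ *	xfs_sb_mount_common(mp, sbp);
+ *	...
+ *	xfs_ialloc_setup_geometry(mp);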
*/ void xfs_sb_mount_common( - struct xfs_mount *mp, - struct xfs_sb *sbp) + struct xfs_mount *mp, + struct xfs_sb *sbp) { mp->m_agfrotor = mp->m_agirotor = 0; mp->m_maxagi = mp->m_sb.sb_agcount; @@ -809,7 +808,6 @@ xfs_sb_mount_common( mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; - mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; @@ -819,11 +817,6 @@ xfs_sb_mount_common( mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; - mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); - mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); - mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; - mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; - mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; @@ -840,14 +833,6 @@ xfs_sb_mount_common( mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK, - sbp->sb_inopblock); - mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; - - if (sbp->sb_spino_align) - mp->m_ialloc_min_blks = sbp->sb_spino_align; - else - mp->m_ialloc_min_blks = mp->m_ialloc_blks; mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } @@ -874,7 +859,7 @@ xfs_initialize_perag_data( uint64_t bfreelst = 0; uint64_t btree = 0; uint64_t fdblocks; - int error; + int error = 0; for (index = 0; index < agcount; index++) { /* @@ -902,7 +887,7 @@ xfs_initialize_perag_data( /* * If the new summary counts are obviously incorrect, fail the * mount operation because that implies the AGFs are also corrupt. - * Clear BAD_SUMMARY so that we don't unmount with a dirty log, which + * Clear FS_COUNTERS so that we don't unmount with a dirty log, which * will prevent xfs_repair from fixing anything. */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { @@ -920,7 +905,7 @@ xfs_initialize_perag_data( xfs_reinit_percpu_counters(mp); out: - mp->m_flags &= ~XFS_MOUNT_BAD_SUMMARY; + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); return error; } @@ -935,7 +920,7 @@ xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + struct xfs_buf *bp = xfs_trans_getsb(tp, mp); mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); @@ -1001,7 +986,7 @@ xfs_update_secondary_sbs( bp = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), - XFS_FSS_TO_BB(mp, 1), 0); + XFS_FSS_TO_BB(mp, 1)); /* * If we get an error reading or writing alternate superblocks, * continue. 
xfs_repair chooses the "best" superblock based @@ -1065,7 +1050,7 @@ xfs_sync_sb_buf( if (error) return error; - bp = xfs_trans_getsb(tp, mp, 0); + bp = xfs_trans_getsb(tp, mp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); @@ -1081,7 +1066,7 @@ out: return error; } -int +void xfs_fs_geometry( struct xfs_sb *sbp, struct xfs_fsop_geom *geo, @@ -1105,13 +1090,13 @@ xfs_fs_geometry( memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); if (struct_version < 2) - return 0; + return; geo->sunit = sbp->sb_unit; geo->swidth = sbp->sb_width; if (struct_version < 3) - return 0; + return; geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | @@ -1155,14 +1140,17 @@ xfs_fs_geometry( geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) - return 0; + return; if (xfs_sb_version_haslogv2(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; - return 0; + if (struct_version < 5) + return; + + geo->version = XFS_FSOP_GEOM_VERSION_V5; } /* Read a secondary superblock. */ diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 13564d69800a..92465a9a5162 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -33,7 +33,7 @@ extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); #define XFS_FS_GEOM_MAX_STRUCT_VER (4) -extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, +extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 1c5debe748f0..e0641b7337b3 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -25,7 +25,8 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; -extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_bnobt_buf_ops; +extern const struct xfs_buf_ops xfs_cntbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; @@ -36,6 +37,7 @@ extern const struct xfs_buf_ops xfs_dquot_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_agi_buf_ops; extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_finobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; @@ -63,7 +65,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ #define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ -#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */ /* * LOWMODE is used by the allocator to activate the lowspace algorithm - when * free space is running low the extent allocator may choose to allocate an @@ -134,4 +135,52 @@ void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); +/* Computed inode geometry for the 
filesystem. */ +struct xfs_ino_geometry { + /* Maximum inode count in this filesystem. */ + uint64_t maxicount; + + /* Actual inode cluster buffer size, in bytes. */ + unsigned int inode_cluster_size; + + /* + * Desired inode cluster buffer size, in bytes. This value is not + * rounded up to at least one filesystem block, which is necessary for + * the sole purpose of validating sb_spino_align. Runtime code must + * only ever use inode_cluster_size. + */ + unsigned int inode_cluster_size_raw; + + /* Inode cluster sizes, adjusted to be at least 1 fsb. */ + unsigned int inodes_per_cluster; + unsigned int blocks_per_cluster; + + /* Inode cluster alignment. */ + unsigned int cluster_align; + unsigned int cluster_align_inodes; + unsigned int inoalign_mask; /* mask sb_inoalignmt if used */ + + unsigned int inobt_mxr[2]; /* max inobt btree records */ + unsigned int inobt_mnr[2]; /* min inobt btree records */ + unsigned int inobt_maxlevels; /* max inobt btree levels. */ + + /* Size of inode allocations under normal operation. */ + unsigned int ialloc_inos; + unsigned int ialloc_blks; + + /* Minimum inode blocks for a sparse allocation. */ + unsigned int ialloc_min_blks; + + /* stripe unit inode alignment */ + unsigned int ialloc_align; + + unsigned int agino_log; /* #bits for agino in inum */ +}; + +/* Keep iterating the data structure. */ +#define XFS_ITER_CONTINUE (0) + +/* Stop iterating the data structure. */ +#define XFS_ITER_ABORT (1) + #endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 77d80106f989..3b8260ca7d1b 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -11,12 +11,8 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_error.h" -#include "xfs_trace.h" -#include "xfs_symlink.h" -#include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" @@ -90,12 +86,12 @@ static xfs_failaddr_t xfs_symlink_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return __this_address; - if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -116,7 +112,7 @@ static void xfs_symlink_read_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; /* no verification of non-crc buffers */ @@ -136,7 +132,7 @@ static void xfs_symlink_write_verify( struct xfs_buf *bp) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; xfs_failaddr_t fa; @@ -159,6 +155,7 @@ xfs_symlink_write_verify( const struct xfs_buf_ops xfs_symlink_buf_ops = { .name = "xfs_symlink", + .magic = { 0, cpu_to_be32(XFS_SYMLINK_MAGIC) }, .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, .verify_struct = xfs_symlink_verify, diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 542927321a61..a9ad90926b87 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -8,13 +8,10 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include 
"xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" -#include "xfs_trace.h" #include <linux/iversion.h> @@ -69,6 +66,10 @@ xfs_trans_ichgtime( inode->i_mtime = tv; if (flags & XFS_ICHGTIME_CHG) inode->i_ctime = tv; + if (flags & XFS_ICHGTIME_CREATE) { + ip->i_d.di_crtime.t_sec = (int32_t)tv.tv_sec; + ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec; + } } /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index f99a7aefe418..d12bbd526e7c 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -15,12 +15,10 @@ #include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_bmap_btree.h" -#include "xfs_ialloc.h" #include "xfs_quota.h" #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_trans_space.h" -#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -136,9 +134,10 @@ STATIC uint xfs_calc_inobt_res( struct xfs_mount *mp) { - return xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); + return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), + XFS_FSB_TO_B(mp, 1)); } /* @@ -167,7 +166,7 @@ xfs_calc_finobt_res( * includes: * * the allocation btrees: 2 trees * (max depth - 1) * block size - * the inode chunk: m_ialloc_blks * N + * the inode chunk: m_ino_geo.ialloc_blks * N * * The size N of the inode chunk reservation depends on whether it is for * allocation or free and which type of create transaction is in use. An inode @@ -193,7 +192,7 @@ xfs_calc_inode_chunk_res( size = XFS_FSB_TO_B(mp, 1); } - res += xfs_calc_buf_res(mp->m_ialloc_blks, size); + res += xfs_calc_buf_res(M_IGEO(mp)->ialloc_blks, size); return res; } @@ -307,7 +306,7 @@ xfs_calc_iunlink_remove_reservation( struct xfs_mount *mp) { return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - 2 * max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); + 2 * M_IGEO(mp)->inode_cluster_size; } /* @@ -345,7 +344,7 @@ STATIC uint xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) { return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + - max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); + M_IGEO(mp)->inode_cluster_size; } /* @@ -876,9 +875,13 @@ xfs_trans_resv_calc( resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; + /* growdata requires permanent res; it can free space to the last AG */ + resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); + resp->tr_growdata.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growdata.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + /* The following transaction are logged in logical format */ resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); - resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index a62fb950bef1..88221c7a04cc 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -56,9 +56,9 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - ((mp)->m_ialloc_blks + \ + (M_IGEO(mp)->ialloc_blks + \ (xfs_sb_version_hasfinobt(&mp->m_sb) ? 
2 : 1 * \ - ((mp)->m_in_maxlevels - 1))) + (M_IGEO(mp)->inobt_maxlevels - 1))) /* * Space reservation values for various transactions. @@ -94,7 +94,8 @@ #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) + (xfs_sb_version_hasfinobt(&mp->m_sb) ? \ + M_IGEO(mp)->inobt_maxlevels : 0) #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index 3306fc42cfad..4f595546a639 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -7,19 +7,10 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" -#include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_rmap.h" -#include "xfs_alloc_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" /* Find the size of the AG, in blocks. */ xfs_agblock_t @@ -87,14 +78,14 @@ xfs_agino_range( * Calculate the first inode, which will be in the first * cluster-aligned block after the AGFL. */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, mp->m_cluster_align); + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); *first = XFS_AGB_TO_AGINO(mp, bno); /* * Calculate the last inode, which will be at the end of the * last (aligned) cluster that can be allocated in the AG. */ - bno = round_down(eoag, mp->m_cluster_align); + bno = round_down(eoag, M_IGEO(mp)->cluster_align); *last = XFS_AGB_TO_AGINO(mp, bno) - 1; } @@ -116,6 +107,19 @@ xfs_verify_agino( } /* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +bool +xfs_verify_agino_or_null( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); +} + +/* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. */ @@ -172,7 +176,7 @@ xfs_verify_rtbno( } /* Calculate the range of valid icount values. */ -static void +void xfs_icount_range( struct xfs_mount *mp, unsigned long long *min, @@ -204,3 +208,14 @@ xfs_verify_icount( xfs_icount_range(mp, &min, &max); return icount >= min && icount <= max; } + +/* Sanity-checking of dir/attr block offsets. 
*/ +bool +xfs_verify_dablk( + struct xfs_mount *mp, + xfs_fileoff_t dabno) +{ + xfs_dablk_t max_dablk = -1U; + + return dabno <= max_dablk; +} diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8f02855a019a..802b34cd10fe 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -183,10 +183,15 @@ void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t *first, xfs_agino_t *last); bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino); +bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno); bool xfs_verify_icount(struct xfs_mount *mp, unsigned long long icount); +bool xfs_verify_dablk(struct xfs_mount *mp, xfs_fileoff_t off); +void xfs_icount_range(struct xfs_mount *mp, unsigned long long *min, + unsigned long long *max); #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 90955ab1e895..16b09b941441 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -9,20 +9,13 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" -#include "xfs_log_format.h" -#include "xfs_trans.h" #include "xfs_sb.h" -#include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_rmap.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" -#include "scrub/trace.h" /* Superblock */ @@ -399,7 +392,7 @@ xchk_agf_xref_cntbt( if (!xchk_should_check_xref(sc, &error, &sc->sa.cnt_cur)) return; if (!have) { - if (agf->agf_freeblks != be32_to_cpu(0)) + if (agf->agf_freeblks != cpu_to_be32(0)) xchk_block_xref_set_corrupt(sc, sc->sa.agf_bp); return; } @@ -514,6 +507,7 @@ xchk_agf( { struct xfs_mount *mp = sc->mp; struct xfs_agf *agf; + struct xfs_perag *pag; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -586,6 +580,16 @@ xchk_agf( if (agfl_count != 0 && fl_count != agfl_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); + /* Do the incore counters match? */ + pag = xfs_perag_get(mp, agno); + if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + xfs_perag_put(pag); + xchk_agf_xref(sc); out: return error; @@ -635,7 +639,7 @@ xchk_agfl_block( xchk_agfl_block_xref(sc, agbno); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return XFS_BTREE_QUERY_RANGE_ABORT; + return XFS_ITER_ABORT; return 0; } @@ -726,7 +730,7 @@ xchk_agfl( /* Check the blocks in the AGFL. 
*/ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), sc->sa.agfl_bp, xchk_agfl_block, &sai); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + if (error == XFS_ITER_ABORT) { error = 0; goto out_free; } @@ -811,6 +815,7 @@ xchk_agi( { struct xfs_mount *mp = sc->mp; struct xfs_agi *agi; + struct xfs_perag *pag; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -864,25 +869,31 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (agino != NULLAGINO && !xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (agino == NULLAGINO) - continue; - if (!xfs_verify_agino(mp, agno, agino)) + if (!xfs_verify_agino_or_null(mp, agno, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } if (agi->agi_pad32 != cpu_to_be32(0)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + /* Do the incore counters match? */ + pag = xfs_perag_get(mp, agno); + if (pag->pagi_count != be32_to_cpu(agi->agi_count)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xfs_perag_put(pag); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 03d1e15cceba..7a1a38b636a9 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -9,22 +9,17 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" -#include "xfs_inode.h" #include "xfs_alloc.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" -#include "xfs_refcount.h" #include "xfs_refcount_btree.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -341,23 +336,19 @@ xrep_agf( struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTB_CRC_MAGIC, + .buf_ops = &xfs_bnobt_buf_ops, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, - .buf_ops = &xfs_allocbt_buf_ops, - .magic = XFS_ABTC_CRC_MAGIC, + .buf_ops = &xfs_cntbt_buf_ops, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, - .magic = XFS_RMAP_CRC_MAGIC, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, - .magic = XFS_REFC_CRC_MAGIC, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -875,12 +866,10 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_IBT_CRC_MAGIC, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, - .buf_ops = &xfs_inobt_buf_ops, - .magic = XFS_FIBT_CRC_MAGIC, + .buf_ops = &xfs_finobt_buf_ops, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 44883e9112ad..a43d1813c4ff 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,19 +9,12 @@ #include "xfs_format.h" #include 
"xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" -#include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "scrub/trace.h" /* * Set us up to scrub free space btrees. diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 81d5e90547a1..1afc58bf71dd 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -9,26 +9,62 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_dir2.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" -#include "scrub/trace.h" +#include "scrub/attr.h" -#include <linux/posix_acl_xattr.h> -#include <linux/xattr.h> +/* + * Allocate enough memory to hold an attr value and attr block bitmaps, + * reallocating the buffer if necessary. Buffer contents are not preserved + * across a reallocation. + */ +int +xchk_setup_xattr_buf( + struct xfs_scrub *sc, + size_t value_size, + xfs_km_flags_t flags) +{ + size_t sz; + struct xchk_xattr_buf *ab = sc->buf; + + /* + * We need enough space to read an xattr value from the file or enough + * space to hold three copies of the xattr free space bitmap. We don't + * need the buffer space for both purposes at the same time. + */ + sz = 3 * sizeof(long) * BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); + sz = max_t(size_t, sz, value_size); + + /* + * If there's already a buffer, figure out if we need to reallocate it + * to accommodate a larger size. + */ + if (ab) { + if (sz <= ab->sz) + return 0; + kmem_free(ab); + sc->buf = NULL; + } + + /* + * Don't zero the buffer upon allocation to avoid runtime overhead. + * All users must be careful never to read uninitialized contents. + */ + ab = kmem_alloc_large(sizeof(*ab) + sz, flags); + if (!ab) + return -ENOMEM; + + ab->sz = sz; + sc->buf = ab; + return 0; +} /* Set us up to scrub an inode's extended attributes. */ int @@ -36,19 +72,18 @@ xchk_setup_xattr( struct xfs_scrub *sc, struct xfs_inode *ip) { - size_t sz; + int error; /* - * Allocate the buffer without the inode lock held. We need enough - * space to read every xattr value in the file or enough space to - * hold three copies of the xattr free space bitmap. (Not both at - * the same time.) + * We failed to get memory while checking attrs, so this time try to + * get all the memory we're ever going to need. Allocate the buffer + * without the inode lock held, which means we can sleep. */ - sz = max_t(size_t, XATTR_SIZE_MAX, 3 * sizeof(long) * - BITS_TO_LONGS(sc->mp->m_attr_geo->blksize)); - sc->buf = kmem_zalloc_large(sz, KM_SLEEP); - if (!sc->buf) - return -ENOMEM; + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, KM_SLEEP); + if (error) + return error; + } return xchk_setup_inode_contents(sc, ip, 0); } @@ -82,12 +117,36 @@ xchk_xattr_listent( sx = container_of(context, struct xchk_xattr, context); + if (xchk_should_terminate(sx->sc, &error)) { + context->seen_enough = error; + return; + } + if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. 
 */
		xchk_ino_set_preen(sx->sc, context->dp->i_ino);
		return;
 	}
 
+	/* Does this name make sense? */
+	if (!xfs_attr_namecheck(name, namelen)) {
+		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+		return;
+	}
+
+	/*
+	 * Try to allocate enough memory to extract the attr value.  If that
+	 * doesn't work, we overload the seen_enough variable to convey
+	 * the error code back to the main scrub function.
+	 */
+	error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL);
+	if (error == -ENOMEM)
+		error = -EDEADLOCK;
+	if (error) {
+		context->seen_enough = error;
+		return;
+	}
+
 	args.flags = ATTR_KERNOTIME;
 	if (flags & XFS_ATTR_ROOT)
 		args.flags |= ATTR_ROOT;
@@ -100,8 +159,8 @@ xchk_xattr_listent(
 	args.namelen = namelen;
 	args.hashval = xfs_da_hashname(args.name, args.namelen);
 	args.trans = context->tp;
-	args.value = sx->sc->buf;
-	args.valuelen = XATTR_SIZE_MAX;
+	args.value = xchk_xattr_valuebuf(sx->sc);
+	args.valuelen = valuelen;
 
 	error = xfs_attr_get_ilocked(context->dp, &args);
 	if (error == -EEXIST)
@@ -114,7 +173,7 @@ xchk_xattr_listent(
 			args.blkno);
 fail_xref:
 	if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
-		context->seen_enough = 1;
+		context->seen_enough = XFS_ITER_ABORT;
 	return;
 }
 
@@ -159,13 +218,12 @@ xchk_xattr_check_freemap(
 	unsigned long			*map,
 	struct xfs_attr3_icleaf_hdr	*leafhdr)
 {
-	unsigned long			*freemap;
-	unsigned long			*dstmap;
+	unsigned long			*freemap = xchk_xattr_freemap(sc);
+	unsigned long			*dstmap = xchk_xattr_dstmap(sc);
 	unsigned int			mapsize = sc->mp->m_attr_geo->blksize;
 	int				i;
 
 	/* Construct bitmap of freemap contents. */
-	freemap = (unsigned long *)sc->buf + BITS_TO_LONGS(mapsize);
 	bitmap_zero(freemap, mapsize);
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
 		if (!xchk_xattr_set_map(sc, freemap,
@@ -175,7 +233,6 @@ xchk_xattr_check_freemap(
 	}
 
 	/* Look for bits that are set in freemap and are marked in use. */
-	dstmap = freemap + BITS_TO_LONGS(mapsize);
 	return bitmap_and(dstmap, freemap, map, mapsize) == 0;
 }
 
@@ -190,13 +247,13 @@ xchk_xattr_entry(
 	char				*buf_end,
 	struct xfs_attr_leafblock	*leaf,
 	struct xfs_attr3_icleaf_hdr	*leafhdr,
-	unsigned long			*usedmap,
 	struct xfs_attr_leaf_entry	*ent,
 	int				idx,
 	unsigned int			*usedbytes,
 	__u32				*last_hashval)
 {
 	struct xfs_mount		*mp = ds->state->mp;
+	unsigned long			*usedmap = xchk_xattr_usedmap(ds->sc);
 	char				*name_end;
 	struct xfs_attr_leaf_name_local	*lentry;
 	struct xfs_attr_leaf_name_remote	*rentry;
@@ -256,16 +313,26 @@ xchk_xattr_block(
 	struct xfs_attr_leafblock	*leaf = bp->b_addr;
 	struct xfs_attr_leaf_entry	*ent;
 	struct xfs_attr_leaf_entry	*entries;
-	unsigned long			*usedmap = ds->sc->buf;
+	unsigned long			*usedmap;
 	char				*buf_end;
 	size_t				off;
 	__u32				last_hashval = 0;
 	unsigned int			usedbytes = 0;
 	unsigned int			hdrsize;
 	int				i;
+	int				error;
 
 	if (*last_checked == blk->blkno)
 		return 0;
+
+	/* Allocate memory for block usage checking. */
+	error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL);
+	if (error == -ENOMEM)
+		return -EDEADLOCK;
+	if (error)
+		return error;
+	usedmap = xchk_xattr_usedmap(ds->sc);
+
 	*last_checked = blk->blkno;
 	bitmap_zero(usedmap, mp->m_attr_geo->blksize);
 
@@ -313,7 +380,7 @@ xchk_xattr_block(
 
 		/* Check the entry and nameval.
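 */

		/*
		 * A note on xchk_xattr_check_freemap() above: the three
		 * bitmaps now share one allocation (see scrub/attr.h), and
		 * the check reduces to requiring an empty intersection
		 * between the bytes the leaf's freemap calls free and the
		 * bytes the entry walk marked as used.  In sketch form,
		 * using the helper names from this patch:
		 *
		 *	ok = (bitmap_and(dstmap, freemap, usedmap,
		 *			mapsize) == 0);
		 *
		 * Any overlap means the free list and the entries claim the
		 * same byte, which the caller reports as corruption.
		 */

		/* Check the entry and nameval.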
*/ xchk_xattr_entry(ds, level, buf_end, leaf, &leafhdr, - usedmap, ent, i, &usedbytes, &last_hashval); + ent, i, &usedbytes, &last_hashval); if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; @@ -453,6 +520,10 @@ xchk_xattr( error = xfs_attr_list_int_ilocked(&sx.context); if (!xchk_fblock_process_error(sc, XFS_ATTR_FORK, 0, &error)) goto out; + + /* Did our listent function try to return any errors? */ + if (sx.context.seen_enough < 0) + error = sx.context.seen_enough; out: return error; } diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h new file mode 100644 index 000000000000..13a1d2e8424d --- /dev/null +++ b/fs/xfs/scrub/attr.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_SCRUB_ATTR_H__ +#define __XFS_SCRUB_ATTR_H__ + +/* + * Temporary storage for online scrub and repair of extended attributes. + */ +struct xchk_xattr_buf { + /* Size of @buf, in bytes. */ + size_t sz; + + /* + * Memory buffer -- either used for extracting attr values while + * walking the attributes; or for computing attr block bitmaps when + * checking the attribute tree. + * + * Each bitmap contains enough bits to track every byte in an attr + * block (rounded up to the size of an unsigned long). The attr block + * used space bitmap starts at the beginning of the buffer; the free + * space bitmap follows immediately after; and we have a third buffer + * for storing intermediate bitmap results. + */ + uint8_t buf[0]; +}; + +/* A place to store attribute values. */ +static inline uint8_t * +xchk_xattr_valuebuf( + struct xfs_scrub *sc) +{ + struct xchk_xattr_buf *ab = sc->buf; + + return ab->buf; +} + +/* A bitmap of space usage computed by walking an attr leaf block. */ +static inline unsigned long * +xchk_xattr_usedmap( + struct xfs_scrub *sc) +{ + struct xchk_xattr_buf *ab = sc->buf; + + return (unsigned long *)ab->buf; +} + +/* A bitmap of free space computed by walking attr leaf block free info. */ +static inline unsigned long * +xchk_xattr_freemap( + struct xfs_scrub *sc) +{ + return xchk_xattr_usedmap(sc) + + BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); +} + +/* A bitmap used to hold temporary results. 
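+ *
+ * Taken together, the accessors above carve the single allocation into a
+ * value buffer and three block-sized bitmaps.  A minimal sketch of the
+ * layout arithmetic, reusing the kernel's BITS_TO_LONGS():
+ *
+ *	unsigned long	*usedmap = (unsigned long *)ab->buf;
+ *	unsigned long	*freemap = usedmap + BITS_TO_LONGS(blksize);
+ *	unsigned long	*dstmap = freemap + BITS_TO_LONGS(blksize);
+ *
+ * which is why xchk_setup_xattr_buf() sizes the buffer as the larger of
+ * one attr value and 3 * sizeof(long) * BITS_TO_LONGS(blksize) bytes.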
*/ +static inline unsigned long * +xchk_xattr_dstmap( + struct xfs_scrub *sc) +{ + return xchk_xattr_freemap(sc) + + BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); +} + +int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size, + xfs_km_flags_t flags); + +#endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index fdadc9e1dc49..3d47d111be5a 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -10,11 +10,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "scrub/xfs_scrub.h" -#include "scrub/scrub.h" -#include "scrub/common.h" -#include "scrub/trace.h" -#include "scrub/repair.h" #include "scrub/bitmap.h" /* diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index e1d11f3223e3..1bd29fdc2ab5 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -9,27 +9,19 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_inode_fork.h" #include "xfs_alloc.h" -#include "xfs_rtalloc.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" -#include "xfs_refcount.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "scrub/trace.h" /* Set us up with an inode's bmap. */ int @@ -281,6 +273,31 @@ xchk_bmap_extent_xref( xchk_ag_free(info->sc, &info->sc->sa); } +/* + * Directories and attr forks should never have blocks that can't be addressed + * by a xfs_dablk_t. + */ +STATIC void +xchk_bmap_dirattr_extent( + struct xfs_inode *ip, + struct xchk_bmap_info *info, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t off; + + if (!S_ISDIR(VFS_I(ip)->i_mode) && info->whichfork != XFS_ATTR_FORK) + return; + + if (!xfs_verify_dablk(mp, irec->br_startoff)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, + irec->br_startoff); + + off = irec->br_startoff + irec->br_blockcount - 1; + if (!xfs_verify_dablk(mp, off)) + xchk_fblock_set_corrupt(info->sc, info->whichfork, off); +} + /* Scrub a single extent record. */ STATIC int xchk_bmap_extent( @@ -305,6 +322,8 @@ xchk_bmap_extent( xchk_fblock_set_corrupt(info->sc, info->whichfork, irec->br_startoff); + xchk_bmap_dirattr_extent(ip, info, irec); + /* There should never be a "hole" extent in either extent list. */ if (irec->br_startblock == HOLESTARTBLOCK) xchk_fblock_set_corrupt(info->sc, info->whichfork, diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 6f94d1f7322d..f52a7b8256f9 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -9,14 +9,7 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" -#include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_inode.h" -#include "xfs_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -415,8 +408,17 @@ xchk_btree_check_owner( struct xfs_btree_cur *cur = bs->cur; struct check_owner *co; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && bp == NULL) + /* + * In theory, xfs_btree_get_block should only give us a null buffer + * pointer for the root of a root-in-inode btree type, but we need + * to check defensively here in case the cursor state is also screwed + * up. 
+ */ + if (bp == NULL) { + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; + } /* * We want to cross-reference each btree block with the bnobt diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 0c54ff55b901..18876056e5e0 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -9,22 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_inode.h" #include "xfs_icache.h" -#include "xfs_itable.h" #include "xfs_alloc.h" #include "xfs_alloc_btree.h" -#include "xfs_bmap.h" -#include "xfs_bmap_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" -#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" @@ -32,12 +26,11 @@ #include "xfs_trans_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/btree.h" #include "scrub/repair.h" +#include "scrub/health.h" /* Common code for the metadata scrubbers. */ @@ -208,6 +201,15 @@ xchk_ino_set_preen( trace_xchk_ino_preen(sc, ino, __return_address); } +/* Record something being wrong with the filesystem primary superblock. */ +void +xchk_set_corrupt( + struct xfs_scrub *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_fs_error(sc, 0, __return_address); +} + /* Record a corrupt block. */ void xchk_block_set_corrupt( @@ -458,13 +460,18 @@ xchk_ag_btcur_init( struct xfs_mount *mp = sc->mp; xfs_agnumber_t agno = sa->agno; - if (sa->agf_bp) { + xchk_perag_get(sc->mp, sa); + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, agno, XFS_BTNUM_BNO); if (!sa->bno_cur) goto err; + } + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, agno, XFS_BTNUM_CNT); @@ -473,7 +480,8 @@ xchk_ag_btcur_init( } /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp) { + if (sa->agi_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, agno, XFS_BTNUM_INO); if (!sa->ino_cur) @@ -481,7 +489,8 @@ xchk_ag_btcur_init( } /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, agno, XFS_BTNUM_FINO); if (!sa->fino_cur) @@ -489,7 +498,8 @@ xchk_ag_btcur_init( } /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); if (!sa->rmap_cur) @@ -497,7 +507,8 @@ xchk_ag_btcur_init( } /* Set up a refcountbt cursor for cross-referencing. 
*/ - if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) { + if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); if (!sa->refc_cur) @@ -884,3 +895,21 @@ xchk_ilock_inverted( } return -EDEADLOCK; } + +/* Pause background reaping of resources. */ +void +xchk_stop_reaping( + struct xfs_scrub *sc) +{ + sc->flags |= XCHK_REAPING_DISABLED; + xfs_stop_block_reaping(sc->mp); +} + +/* Restart background reaping of resources. */ +void +xchk_start_reaping( + struct xfs_scrub *sc) +{ + xfs_start_block_reaping(sc->mp); + sc->flags &= ~XCHK_REAPING_DISABLED; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index e26a430bd466..003a772cd26c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -39,6 +39,7 @@ void xchk_block_set_preen(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_set_corrupt(struct xfs_scrub *sc); void xchk_block_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); @@ -105,6 +106,7 @@ xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip) return -ENOENT; } #endif +int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -137,5 +139,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) int xchk_metadata_inode_forks(struct xfs_scrub *sc); int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); +void xchk_stop_reaping(struct xfs_scrub *sc); +void xchk_start_reaping(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index f1260b4bfdee..94c4f1de1922 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -9,20 +9,12 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_inode_fork.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_attr_leaf.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -574,6 +566,11 @@ xchk_da_btree( /* Drill another level deeper. */ blkno = be32_to_cpu(key->before); level++; + if (level >= XFS_DA_NODE_MAXDEPTH) { + /* Too deep! */ + xchk_da_set_corrupt(&ds, level - 1); + break; + } ds.tree_level--; error = xchk_da_btree_block(&ds, level, blkno); if (error) diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index cd3e4d768a18..1e2e11721eb9 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -9,24 +9,14 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" #include "xfs_icache.h" -#include "xfs_itable.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_ialloc.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" -#include "scrub/trace.h" #include "scrub/dabtree.h" /* Set us up to scrub directories. 
 */
@@ -129,6 +119,12 @@ xchk_dir_actor(
 		goto out;
 	}
 
+	/* Does this name make sense? */
+	if (!xfs_dir2_namecheck(name, namelen)) {
+		xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
+		goto out;
+	}
+
 	if (!strncmp(".", name, namelen)) {
 		/* If this is "." then check that the inum matches the dir. */
 		if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR)
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
new file mode 100644
index 000000000000..fc3f510c9034
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.c
@@ -0,0 +1,354 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_health.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * The basics of filesystem summary counter checking are that we iterate the
+ * AGs counting the number of free blocks, free space btree blocks, per-AG
+ * reservations, inodes, delayed allocation reservations, and free inodes.
+ * Then we compare what we computed against the in-core counters.
+ *
+ * However, the reality is that summary counters are a tricky beast to check.
+ * While we /could/ freeze the filesystem and scramble around the AGs counting
+ * the free blocks, in practice we prefer not to do that for a scan because
+ * freezing is costly.  To get around this, we added a per-cpu counter of the
+ * delalloc reservations so that we can rotor around the AGs relatively
+ * quickly, and we allow the counts to be slightly off because we're not
+ * taking any locks while we do this.
+ *
+ * So the first thing we do is warm up the buffer cache in the setup routine
+ * by walking all the AGs to make sure the incore per-AG structure has been
+ * initialized.  The expected value calculation then iterates the incore
+ * per-AG structures as quickly as it can.  We snapshot the percpu counters
+ * before and after this operation and use the difference in counter values
+ * to guess at our tolerance for mismatch between expected and actual counter
+ * values.
+ */
+
+/*
+ * Since the expected value computation is lockless but only browses incore
+ * values, the percpu counters should be fairly close to each other.  However,
+ * we'll allow ourselves to be off by at least this (arbitrary) amount.
+ */
+#define XCHK_FSCOUNT_MIN_VARIANCE	(512)
+
+/*
+ * Make sure the per-AG structure has been initialized from the on-disk header
+ * contents and trust that the incore counters match the ondisk counters.  (The
+ * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the
+ * summary counters after checking all AG headers).  Do this from the setup
+ * function so that the inner AG aggregation loop runs as quickly as possible.
+ *
+ * This function runs during the setup phase /before/ we start checking any
+ * metadata.
+ */
+STATIC int
+xchk_fscount_warmup(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*agi_bp = NULL;
+	struct xfs_buf		*agf_bp = NULL;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno;
+	int			error = 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+
+		if (pag->pagi_init && pag->pagf_init)
+			goto next_loop_perag;
+
+		/* Lock both AG headers.
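+		 * Reading the AGI and then the AGF pins the buffers, and as
+		 * a side effect the read helpers fill in the incore per-AG
+		 * state, which is the whole point of this warmup.  A hedged
+		 * sketch of the postcondition the aggregation loop relies on
+		 * afterwards:
+		 *
+		 *	ASSERT(pag->pagi_init && pag->pagf_init);
+		 *
+		 * If either flag is still clear after a successful read, we
+		 * bail out with -EFSCORRUPTED below.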
*/ + error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + if (error) + break; + error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + if (error) + break; + error = -ENOMEM; + if (!agf_bp || !agi_bp) + break; + + /* + * These are supposed to be initialized by the header read + * function. + */ + error = -EFSCORRUPTED; + if (!pag->pagi_init || !pag->pagf_init) + break; + + xfs_buf_relse(agf_bp); + agf_bp = NULL; + xfs_buf_relse(agi_bp); + agi_bp = NULL; +next_loop_perag: + xfs_perag_put(pag); + pag = NULL; + error = 0; + + if (fatal_signal_pending(current)) + break; + } + + if (agf_bp) + xfs_buf_relse(agf_bp); + if (agi_bp) + xfs_buf_relse(agi_bp); + if (pag) + xfs_perag_put(pag); + return error; +} + +int +xchk_setup_fscounters( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + struct xchk_fscounters *fsc; + int error; + + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + fsc = sc->buf; + + xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); + + /* We must get the incore counters set up before we can proceed. */ + error = xchk_fscount_warmup(sc); + if (error) + return error; + + /* + * Pause background reclaim while we're scrubbing to reduce the + * likelihood of background perturbations to the counters throwing off + * our calculations. + */ + xchk_stop_reaping(sc); + + return xchk_trans_alloc(sc, 0); +} + +/* + * Calculate what the global in-core counters ought to be from the incore + * per-AG structure. Callers can compare this to the actual in-core counters + * to estimate by how much both in-core and on-disk counters need to be + * adjusted. + */ +STATIC int +xchk_fscount_aggregate_agcounts( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + uint64_t delayed; + xfs_agnumber_t agno; + int tries = 8; + +retry: + fsc->icount = 0; + fsc->ifree = 0; + fsc->fdblocks = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + + /* This somehow got unset since the warmup? */ + if (!pag->pagi_init || !pag->pagf_init) { + xfs_perag_put(pag); + return -EFSCORRUPTED; + } + + /* Count all the inodes */ + fsc->icount += pag->pagi_count; + fsc->ifree += pag->pagi_freecount; + + /* Add up the free/freelist/bnobt/cntbt blocks */ + fsc->fdblocks += pag->pagf_freeblks; + fsc->fdblocks += pag->pagf_flcount; + fsc->fdblocks += pag->pagf_btreeblks; + + /* + * Per-AG reservations are taken out of the incore counters, + * so they must be left out of the free blocks computation. + */ + fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; + fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; + + xfs_perag_put(pag); + + if (fatal_signal_pending(current)) + break; + } + + /* + * The global incore space reservation is taken from the incore + * counters, so leave that out of the computation. + */ + fsc->fdblocks -= mp->m_resblks_avail; + + /* + * Delayed allocation reservations are taken out of the incore counters + * but not recorded on disk, so leave them and their indlen blocks out + * of the computation. + */ + delayed = percpu_counter_sum(&mp->m_delalloc_blks); + fsc->fdblocks -= delayed; + + trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, + delayed); + + + /* Bail out if the values we compute are totally nonsense. 
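+	 * "Nonsense" means outside the only physically possible window:
+	 * the root directory, realtime bitmap and realtime summary inodes
+	 * always exist, so at least one inode chunk must be allocated,
+	 * which is why xfs_icount_range() reports an icount_min of one
+	 * chunk's worth (XFS_INODES_PER_CHUNK); icount_max is the number
+	 * of inode slots that fit in all AGs combined.  As a worked
+	 * example, a filesystem whose four AGs can each hold 16 million
+	 * inode slots has icount_max = 64 million while icount_min stays
+	 * at 64.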
*/ + if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || + fsc->fdblocks > mp->m_sb.sb_dblocks || + fsc->ifree > fsc->icount_max) + return -EFSCORRUPTED; + + /* + * If ifree > icount then we probably had some perturbation in the + * counters while we were calculating things. We'll try a few times + * to maintain ifree <= icount before giving up. + */ + if (fsc->ifree > fsc->icount) { + if (tries--) + goto retry; + xchk_set_incomplete(sc); + return 0; + } + + return 0; +} + +/* + * Is the @counter reasonably close to the @expected value? + * + * We neither locked nor froze anything in the filesystem while aggregating the + * per-AG data to compute the @expected value, which means that the counter + * could have changed. We know the @old_value of the summation of the counter + * before the aggregation, and we re-sum the counter now. If the expected + * value falls between the two summations, we're ok. + * + * Otherwise, we /might/ have a problem. If the change in the summations is + * more than we want to tolerate, the filesystem is probably busy and we should + * just send back INCOMPLETE and see if userspace will try again. + */ +static inline bool +xchk_fscount_within_range( + struct xfs_scrub *sc, + const int64_t old_value, + struct percpu_counter *counter, + uint64_t expected) +{ + int64_t min_value, max_value; + int64_t curr_value = percpu_counter_sum(counter); + + trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, + old_value); + + /* Negative values are always wrong. */ + if (curr_value < 0) + return false; + + /* Exact matches are always ok. */ + if (curr_value == expected) + return true; + + min_value = min(old_value, curr_value); + max_value = max(old_value, curr_value); + + /* Within the before-and-after range is ok. */ + if (expected >= min_value && expected <= max_value) + return true; + + /* + * If the difference between the two summations is too large, the fs + * might just be busy and so we'll mark the scrub incomplete. Return + * true here so that we don't mark the counter corrupt. + * + * XXX: In the future when userspace can grant scrub permission to + * quiesce the filesystem to solve the outsized variance problem, this + * check should be moved up and the return code changed to signal to + * userspace that we need quiesce permission. + */ + if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { + xchk_set_incomplete(sc); + return true; + } + + return false; +} + +/* Check the superblock counters. */ +int +xchk_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + int64_t icount, ifree, fdblocks; + int error; + + /* Snapshot the percpu counters. */ + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + /* No negative values, please! */ + if (icount < 0 || ifree < 0 || fdblocks < 0) + xchk_set_corrupt(sc); + + /* See if icount is obviously wrong. */ + if (icount < fsc->icount_min || icount > fsc->icount_max) + xchk_set_corrupt(sc); + + /* See if fdblocks is obviously wrong. */ + if (fdblocks > mp->m_sb.sb_dblocks) + xchk_set_corrupt(sc); + + /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. + */ + if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) + xchk_set_corrupt(sc); + + /* Walk the incore AG headers to calculate the expected counters. 
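+	 * Each expected value is then compared against its percpu counter
+	 * via xchk_fscount_within_range() above.  A made-up example of the
+	 * tolerance rule: if the pre-scan summation of m_fdblocks was 10000
+	 * and re-summing now yields 10040, any expected value in
+	 * [10000, 10040] passes; an expected value outside that window is
+	 * downgraded to "incomplete" only when the window itself spans at
+	 * least XCHK_FSCOUNT_MIN_VARIANCE (512) blocks, and otherwise the
+	 * counter is marked corrupt.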
*/ + error = xchk_fscount_aggregate_agcounts(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare the in-core counters with whatever we counted. */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) + xchk_set_corrupt(sc); + + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) + xchk_set_corrupt(sc); + + if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, + fsc->fdblocks)) + xchk_set_corrupt(sc); + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c new file mode 100644 index 000000000000..b2f602811e9d --- /dev/null +++ b/fs/xfs/scrub/health.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_btree.h" +#include "xfs_sb.h" +#include "xfs_health.h" +#include "scrub/scrub.h" + +/* + * Scrub and In-Core Filesystem Health Assessments + * =============================================== + * + * Online scrub and repair have the time and the ability to perform stronger + * checks than we can do from the metadata verifiers, because they can + * cross-reference records between data structures. Therefore, scrub is in a + * good position to update the online filesystem health assessments to reflect + * the good/bad state of the data structure. + * + * We therefore extend scrub in the following ways to achieve this: + * + * 1. Create a "sick_mask" field in the scrub context. When we're setting up a + * scrub call, set this to the default XFS_SICK_* flag(s) for the selected + * scrub type (call it A). Scrub and repair functions can override the default + * sick_mask value if they choose. + * + * 2. If the scrubber returns a runtime error code, we exit making no changes + * to the incore sick state. + * + * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore + * sick flags before exiting. + * + * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore + * sick flags. If the user didn't want to repair then we exit, leaving the + * metadata structure unfixed and the sick flag set. + * + * 5. Now we know that A is corrupt and the user wants to repair, so run the + * repairer. If the repairer returns an error code, we exit with that error + * code, having made no further changes to the incore sick state. + * + * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean, + * use sick_mask to clear the incore sick flags. This should have the effect + * that A is no longer marked sick. + * + * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and + * use sick_mask to set the incore sick flags. This should have no externally + * visible effect since we already set them in step (4). + * + * There are some complications to this story, however. For certain types of + * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild + * both structures at the same time. The following principles apply to this + * type of repair strategy: + * + * 8. Any repair function that rebuilds multiple structures should update + * sick_mask_visible to reflect whatever other structures are rebuilt, and + * verify that all the rebuilt structures can pass a scrub check. 
The outcomes + * of 5-7 still apply, but with a sick_mask that covers everything being + * rebuilt. + */ + +/* Map our scrub type to a sick mask and a set of health update functions. */ + +enum xchk_health_group { + XHG_FS = 1, + XHG_RT, + XHG_AG, + XHG_INO, +}; + +struct xchk_health_map { + enum xchk_health_group group; + unsigned int sick_mask; +}; + +static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, + [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, + [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, + [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI }, + [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT }, + [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT }, + [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT }, + [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT }, + [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT }, + [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT }, + [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE }, + [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD }, + [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA }, + [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC }, + [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR }, + [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, + [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, + [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, + [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, + [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, +}; + +/* Return the health status mask for this scrub type. */ +unsigned int +xchk_health_mask_for_scrub_type( + __u32 scrub_type) +{ + return type_to_health_flag[scrub_type].sick_mask; +} + +/* + * Update filesystem health assessments based on what we found and did. + * + * If the scrubber finds errors, we mark sick whatever's mentioned in + * sick_mask, no matter whether this is a first scan or an + * evaluation of repair effectiveness. + * + * Otherwise, no direct corruption was found, so mark whatever's in + * sick_mask as healthy. + */ +void +xchk_update_health( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + bool bad; + + if (!sc->sick_mask) + return; + + bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT); + switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_AG: + pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); + if (bad) + xfs_ag_mark_sick(pag, sc->sick_mask); + else + xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_perag_put(pag); + break; + case XHG_INO: + if (!sc->ip) + return; + if (bad) + xfs_inode_mark_sick(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + break; + case XHG_FS: + if (bad) + xfs_fs_mark_sick(sc->mp, sc->sick_mask); + else + xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + break; + case XHG_RT: + if (bad) + xfs_rt_mark_sick(sc->mp, sc->sick_mask); + else + xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + break; + default: + ASSERT(0); + break; + } +} + +/* Is the given per-AG btree healthy enough for scanning? 
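+ *
+ * Returning false also sets XFS_SCRUB_OFLAG_XFAIL so userspace can see
+ * that a cross-reference was skipped rather than silently dropped.  The
+ * intended caller pattern, mirroring xchk_ag_btcur_init() in
+ * scrub/common.c, is to leave the cursor NULL when the gate says no:
+ *
+ *	if (sa->agf_bp &&
+ *	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO))
+ *		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp,
+ *				sa->agf_bp, agno, XFS_BTNUM_BNO);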
*/ +bool +xchk_ag_btree_healthy_enough( + struct xfs_scrub *sc, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + unsigned int mask = 0; + + /* + * We always want the cursor if it's the same type as whatever we're + * scrubbing, even if we already know the structure is corrupt. + * + * Otherwise, we're only interested in the btree for cross-referencing. + * If we know the btree is bad then don't bother, just set XFAIL. + */ + switch (btnum) { + case XFS_BTNUM_BNO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + return true; + mask = XFS_SICK_AG_BNOBT; + break; + case XFS_BTNUM_CNT: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) + return true; + mask = XFS_SICK_AG_CNTBT; + break; + case XFS_BTNUM_INO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + return true; + mask = XFS_SICK_AG_INOBT; + break; + case XFS_BTNUM_FINO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) + return true; + mask = XFS_SICK_AG_FINOBT; + break; + case XFS_BTNUM_RMAP: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) + return true; + mask = XFS_SICK_AG_RMAPBT; + break; + case XFS_BTNUM_REFC: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) + return true; + mask = XFS_SICK_AG_REFCNTBT; + break; + default: + ASSERT(0); + return true; + } + + if (xfs_ag_has_sickness(pag, mask)) { + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + return false; + } + + return true; +} diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h new file mode 100644 index 000000000000..d0b938d3d028 --- /dev/null +++ b/fs/xfs/scrub/health.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_SCRUB_HEALTH_H__ +#define __XFS_SCRUB_HEALTH_H__ + +unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); +void xchk_update_health(struct xfs_scrub *sc); +bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, + xfs_btnum_t btnum); + +#endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 882dc56c5c21..681758704fda 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -9,21 +9,14 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_icache.h" #include "xfs_rmap.h" -#include "xfs_log.h" -#include "xfs_trans_priv.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" @@ -39,7 +32,7 @@ xchk_setup_ag_iallocbt( struct xfs_scrub *sc, struct xfs_inode *ip) { - return xchk_setup_ag_btree(sc, ip, sc->try_harder); + return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER); } /* Inode btree scrubber. */ @@ -47,6 +40,12 @@ xchk_setup_ag_iallocbt( struct xchk_iallocbt { /* Number of inodes we see while scanning inobt. */ unsigned long long inodes; + + /* Expected next startino, for big block filesystems. */ + xfs_agino_t next_startino; + + /* Expected end of the current inode cluster. */ + xfs_agino_t next_cluster_ino; }; /* @@ -128,42 +127,58 @@ xchk_iallocbt_freecount( return hweight64(freemask); } -/* Check a particular inode with ir_free. */ +/* + * Check that an inode's allocation status matches ir_free in the inobt + * record. 
First we try querying the in-core inode state, and if the inode + * isn't loaded we examine the on-disk inode directly. + * + * Since there can be 1:M and M:1 mappings between inobt records and inode + * clusters, we pass in the inode location information as an inobt record; + * the index of an inode cluster within the inobt record (as well as the + * cluster buffer itself); and the index of the inode within the cluster. + * + * @irec is the inobt record. + * @irec_ino is the inode offset from the start of the record. + * @dip is the on-disk inode. + */ STATIC int -xchk_iallocbt_check_cluster_freemask( +xchk_iallocbt_check_cluster_ifree( struct xchk_btree *bs, - xfs_ino_t fsino, - xfs_agino_t chunkino, - xfs_agino_t clusterino, struct xfs_inobt_rec_incore *irec, - struct xfs_buf *bp) + unsigned int irec_ino, + struct xfs_dinode *dip) { - struct xfs_dinode *dip; struct xfs_mount *mp = bs->cur->bc_mp; - bool inode_is_free = false; + xfs_ino_t fsino; + xfs_agino_t agino; + bool irec_free; + bool ino_inuse; bool freemask_ok; - bool inuse; int error = 0; if (xchk_should_terminate(bs->sc, &error)) return error; - dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize); + /* + * Given an inobt record and the offset of an inode from the start of + * the record, compute which fs inode we're talking about. + */ + agino = irec->ir_startino + irec_ino; + fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); + irec_free = (irec->ir_free & XFS_INOBT_MASK(irec_ino)); + if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC || - (dip->di_version >= 3 && - be64_to_cpu(dip->di_ino) != fsino + clusterino)) { + (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } - if (irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino)) - inode_is_free = true; - error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, - fsino + clusterino, &inuse); + error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino, + &ino_inuse); if (error == -ENODATA) { /* Not cached, just read the disk buffer */ - freemask_ok = inode_is_free ^ !!(dip->di_mode); - if (!bs->sc->try_harder && !freemask_ok) + freemask_ok = irec_free ^ !!(dip->di_mode); + if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { /* @@ -174,7 +189,7 @@ xchk_iallocbt_check_cluster_freemask( goto out; } else { /* Inode is all there. */ - freemask_ok = inode_is_free ^ inuse; + freemask_ok = irec_free ^ ino_inuse; } if (!freemask_ok) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); @@ -182,86 +197,223 @@ out: return 0; } -/* Make sure the free mask is consistent with what the inodes think. */ +/* + * Check that the holemask and freemask of a hypothetical inode cluster match + * what's actually on disk. If sparse inodes are enabled, the cluster does + * not actually have to map to inodes if the corresponding holemask bit is set. + * + * @cluster_base is the first inode in the cluster within the @irec. 
+ */ STATIC int -xchk_iallocbt_check_freemask( +xchk_iallocbt_check_cluster( struct xchk_btree *bs, - struct xfs_inobt_rec_incore *irec) + struct xfs_inobt_rec_incore *irec, + unsigned int cluster_base) { struct xfs_imap imap; struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_dinode *dip; - struct xfs_buf *bp; - xfs_ino_t fsino; - xfs_agino_t nr_inodes; - xfs_agino_t agino; - xfs_agino_t chunkino; - xfs_agino_t clusterino; + struct xfs_buf *cluster_bp; + unsigned int nr_inodes; + xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t agbno; - uint16_t holemask; + unsigned int cluster_index; + uint16_t cluster_mask = 0; uint16_t ir_holemask; int error = 0; - /* Make sure the freemask matches the inode records. */ - nr_inodes = mp->m_inodes_per_cluster; - - for (agino = irec->ir_startino; - agino < irec->ir_startino + XFS_INODES_PER_CHUNK; - agino += mp->m_inodes_per_cluster) { - fsino = XFS_AGINO_TO_INO(mp, bs->cur->bc_private.a.agno, agino); - chunkino = agino - irec->ir_startino; - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - - /* Compute the holemask mask for this cluster. */ - for (clusterino = 0, holemask = 0; clusterino < nr_inodes; - clusterino += XFS_INODES_PER_HOLEMASK_BIT) - holemask |= XFS_INOBT_MASK((chunkino + clusterino) / - XFS_INODES_PER_HOLEMASK_BIT); - - /* The whole cluster must be a hole or not a hole. */ - ir_holemask = (irec->ir_holemask & holemask); - if (ir_holemask != holemask && ir_holemask != 0) { + nr_inodes = min_t(unsigned int, XFS_INODES_PER_CHUNK, + M_IGEO(mp)->inodes_per_cluster); + + /* Map this inode cluster */ + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino + cluster_base); + + /* Compute a bitmask for this cluster that can be used for holemask. */ + for (cluster_index = 0; + cluster_index < nr_inodes; + cluster_index += XFS_INODES_PER_HOLEMASK_BIT) + cluster_mask |= XFS_INOBT_MASK((cluster_base + cluster_index) / + XFS_INODES_PER_HOLEMASK_BIT); + + /* + * Map the first inode of this cluster to a buffer and offset. + * Be careful about inobt records that don't align with the start of + * the inode buffer when block sizes are large enough to hold multiple + * inode chunks. When this happens, cluster_base will be zero but + * ir_startino can be large enough to make im_boffset nonzero. + */ + ir_holemask = (irec->ir_holemask & cluster_mask); + imap.im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap.im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster); + imap.im_boffset = XFS_INO_TO_OFFSET(mp, irec->ir_startino) << + mp->m_sb.sb_inodelog; + + if (imap.im_boffset != 0 && cluster_base != 0) { + ASSERT(imap.im_boffset == 0 || cluster_base == 0); + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + trace_xchk_iallocbt_check_cluster(mp, agno, irec->ir_startino, + imap.im_blkno, imap.im_len, cluster_base, nr_inodes, + cluster_mask, ir_holemask, + XFS_INO_TO_OFFSET(mp, irec->ir_startino + + cluster_base)); + + /* The whole cluster must be a hole or not a hole. */ + if (ir_holemask != cluster_mask && ir_holemask != 0) { + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return 0; + } + + /* If any part of this is a hole, skip it. */ + if (ir_holemask) { + xchk_xref_is_not_owned_by(bs->sc, agbno, + M_IGEO(mp)->blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + return 0; + } + + xchk_xref_is_owned_by(bs->sc, agbno, M_IGEO(mp)->blocks_per_cluster, + &XFS_RMAP_OINFO_INODES); + + /* Grab the inode cluster buffer. 
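+	 * The im_boffset computed above is a byte offset into that buffer;
+	 * for the second 64-inode chunk in a 64k block with 512-byte
+	 * inodes, XFS_INO_TO_OFFSET() yields 64 and sb_inodelog is 9, so
+	 * the ifree walk below starts 64 << 9 = 32768 bytes into the
+	 * cluster buffer.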
*/ + error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &cluster_bp, + 0, 0); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0, &error)) + return error; + + /* Check free status of each inode within this cluster. */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + struct xfs_dinode *dip; + + if (imap.im_boffset >= BBTOB(cluster_bp->b_length)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); - continue; + break; } - /* If any part of this is a hole, skip it. */ - if (ir_holemask) { - xchk_xref_is_not_owned_by(bs->sc, agbno, - mp->m_blocks_per_cluster, - &XFS_RMAP_OINFO_INODES); - continue; + dip = xfs_buf_offset(cluster_bp, imap.im_boffset); + error = xchk_iallocbt_check_cluster_ifree(bs, irec, + cluster_base + cluster_index, dip); + if (error) + break; + imap.im_boffset += mp->m_sb.sb_inodesize; + } + + xfs_trans_brelse(bs->cur->bc_tp, cluster_bp); + return error; +} + +/* + * For all the inode clusters that could map to this inobt record, make sure + * that the holemask makes sense and that the allocation status of each inode + * matches the freemask. + */ +STATIC int +xchk_iallocbt_check_clusters( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + unsigned int cluster_base; + int error = 0; + + /* + * For the common case where this inobt record maps to multiple inode + * clusters this will call _check_cluster for each cluster. + * + * For the case that multiple inobt records map to a single cluster, + * this will call _check_cluster once. + */ + for (cluster_base = 0; + cluster_base < XFS_INODES_PER_CHUNK; + cluster_base += M_IGEO(bs->sc->mp)->inodes_per_cluster) { + error = xchk_iallocbt_check_cluster(bs, irec, cluster_base); + if (error) + break; + } + + return error; +} + +/* + * Make sure this inode btree record is aligned properly. Because a fs block + * contains multiple inodes, we check that the inobt record is aligned to the + * correct inode, not just the correct block on disk. This results in a finer + * grained corruption check. + */ +STATIC void +xchk_iallocbt_rec_alignment( + struct xchk_btree *bs, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_mount *mp = bs->sc->mp; + struct xchk_iallocbt *iabt = bs->private; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + + /* + * finobt records have different positioning requirements than inobt + * records: each finobt record must have a corresponding inobt record. + * That is checked in the xref function, so for now we only catch the + * obvious case where the record isn't at all aligned properly. + * + * Note that if a fs block contains more than a single chunk of inodes, + * we will have finobt records only for those chunks containing free + * inodes, and therefore expect chunk alignment of finobt records. + * Otherwise, we expect that the finobt record is aligned to the + * cluster alignment as told by the superblock. + */ + if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + unsigned int imask; + + imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, + igeo->cluster_align_inodes) - 1; + if (irec->ir_startino & imask) + xchk_btree_set_corrupt(bs->sc, bs->cur, 0); + return; + } + + if (iabt->next_startino != NULLAGINO) { + /* + * We're midway through a cluster of inodes that is mapped by + * multiple inobt records. Did we get the record for the next + * irec in the sequence? 
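+		 * For example, with 64k blocks and 512-byte inodes, a
+		 * 128-inode cluster starting at agino 256 is described by
+		 * two 64-inode records: seeing the first (ir_startino ==
+		 * 256) arms next_startino = 320 and next_cluster_ino = 384,
+		 * the second record must then start at exactly 320, and
+		 * once next_startino reaches next_cluster_ino both fields
+		 * reset to NULLAGINO below.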
+	 */
+	if (irec->ir_startino != iabt->next_startino) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
 		}
 
-		xchk_xref_is_owned_by(bs->sc, agbno, mp->m_blocks_per_cluster,
-				&XFS_RMAP_OINFO_INODES);
+		iabt->next_startino += XFS_INODES_PER_CHUNK;
 
-		/* Grab the inode cluster buffer. */
-		imap.im_blkno = XFS_AGB_TO_DADDR(mp, bs->cur->bc_private.a.agno,
-				agbno);
-		imap.im_len = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster);
-		imap.im_boffset = 0;
-
-		error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap,
-				&dip, &bp, 0, 0);
-		if (!xchk_btree_xref_process_error(bs->sc, bs->cur, 0,
-				&error))
-			continue;
-
-		/* Which inodes are free? */
-		for (clusterino = 0; clusterino < nr_inodes; clusterino++) {
-			error = xchk_iallocbt_check_cluster_freemask(bs,
-					fsino, chunkino, clusterino, irec, bp);
-			if (error) {
-				xfs_trans_brelse(bs->cur->bc_tp, bp);
-				return error;
-			}
+		/* Are we done with the cluster? */
+		if (iabt->next_startino >= iabt->next_cluster_ino) {
+			iabt->next_startino = NULLAGINO;
+			iabt->next_cluster_ino = NULLAGINO;
 		}
+		return;
+	}
+
+	/* inobt records must be aligned to cluster and inoalignment size. */
+	if (irec->ir_startino & (igeo->cluster_align_inodes - 1)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
+	}
 
-		xfs_trans_brelse(bs->cur->bc_tp, bp);
+	if (irec->ir_startino & (igeo->inodes_per_cluster - 1)) {
+		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+		return;
 	}
 
-	return error;
+	if (igeo->inodes_per_cluster <= XFS_INODES_PER_CHUNK)
+		return;
+
+	/*
+	 * If this is the start of an inode cluster that can be mapped by
+	 * multiple inobt records, the next inobt record must follow exactly
+	 * after this one.
+	 */
+	iabt->next_startino = irec->ir_startino + XFS_INODES_PER_CHUNK;
+	iabt->next_cluster_ino = irec->ir_startino + igeo->inodes_per_cluster;
 }
 
 /* Scrub an inobt/finobt record. */
@@ -276,7 +428,6 @@ xchk_iallocbt_rec(
 	uint64_t		holes;
 	xfs_agnumber_t		agno = bs->cur->bc_private.a.agno;
 	xfs_agino_t		agino;
-	xfs_agblock_t		agbno;
 	xfs_extlen_t		len;
 	int			holecount;
 	int			i;
@@ -303,11 +454,9 @@ xchk_iallocbt_rec(
 		goto out;
 	}
 
-	/* Make sure this record is aligned to cluster and inoalignmnt size. */
-	agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
-	if ((agbno & (mp->m_cluster_align - 1)) ||
-	    (agbno & (mp->m_blocks_per_cluster - 1)))
-		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
+	xchk_iallocbt_rec_alignment(bs, &irec);
+	if (bs->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		goto out;
 
 	iabt->inodes += irec.ir_count;
 
@@ -320,7 +469,7 @@ xchk_iallocbt_rec(
 		if (!xchk_iallocbt_chunk(bs, &irec, agino, len))
 			goto out;
 
-		goto check_freemask;
+		goto check_clusters;
 	}
 
 	/* Check each chunk of a sparse inode cluster.
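 	 * Each of the 16 holemask bits covers XFS_INODES_PER_HOLEMASK_BIT
 	 * (64 / 16 = 4) inodes of the chunk; for instance, ir_holemask ==
 	 * 0xff00 describes a sparse chunk whose upper 32 inodes (bits 8-15)
 	 * are holes while the lower 32 are physically allocated.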
*/ @@ -346,8 +495,8 @@ xchk_iallocbt_rec( holecount + irec.ir_count != XFS_INODES_PER_CHUNK) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); -check_freemask: - error = xchk_iallocbt_check_freemask(bs, &irec); +check_clusters: + error = xchk_iallocbt_check_clusters(bs, &irec); if (error) goto out; @@ -429,6 +578,8 @@ xchk_iallocbt( struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { .inodes = 0, + .next_startino = NULLAGINO, + .next_cluster_ino = NULLAGINO, }; int error; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index e213efc194a1..6d483ab29e63 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -9,27 +9,17 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_icache.h" -#include "xfs_inode_buf.h" -#include "xfs_inode_fork.h" #include "xfs_ialloc.h" #include "xfs_da_format.h" #include "xfs_reflink.h" #include "xfs_rmap.h" -#include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "scrub/trace.h" /* * Grab total control of the inode metadata. It doesn't matter here if diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 1c9d7c7f64f5..c962bd534690 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -9,21 +9,13 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_ialloc.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" -#include "scrub/trace.h" /* Set us up to scrub parents. */ int @@ -320,7 +312,7 @@ out: * If we failed to lock the parent inode even after a retry, just mark * this scrub incomplete and return. */ - if (sc->try_harder && error == -EDEADLOCK) { + if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { error = 0; xchk_set_incomplete(sc); } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 782d582d3edd..0a33b4421c32 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -9,24 +9,13 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_inode_fork.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_dquot.h" -#include "xfs_dquot_item.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" -#include "scrub/trace.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. 
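 * In sketch form, assuming the XFS_DQ_* flag names from the quota code of
 * this era, the mapping is simply:
 *
 *	case XFS_SCRUB_TYPE_UQUOTA:	return XFS_DQ_USER;
 *	case XFS_SCRUB_TYPE_GQUOTA:	return XFS_DQ_GROUP;
 *	case XFS_SCRUB_TYPE_PQUOTA:	return XFS_DQ_PROJ;
 *	default:			return 0;
 *
 * so xchk_setup_quota() below can turn an unknown type into -EINVAL.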
*/ static inline uint @@ -60,7 +49,7 @@ xchk_setup_quota( dqtype = xchk_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; - sc->has_quotaofflock = true; + sc->flags |= XCHK_HAS_QUOTAOFFLOCK; mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; @@ -144,7 +133,7 @@ xchk_quota_item( if (bsoft > bhard) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - if (ihard > mp->m_maxicount) + if (ihard > M_IGEO(mp)->maxicount) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); if (isoft > ihard) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 708b4158eb90..93b3793bc5b3 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -7,22 +7,12 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" -#include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_alloc.h" #include "xfs_rmap.h" #include "xfs_refcount.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "scrub/trace.h" /* * Set us up to scrub reference count btrees. diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 6acf1bfa0bfe..4cfeec57fb05 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -9,29 +9,21 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_icache.h" #include "xfs_alloc.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" -#include "xfs_refcount.h" #include "xfs_refcount_btree.h" #include "xfs_extent_busy.h" #include "xfs_ag_resv.h" -#include "xfs_trans_space.h" #include "xfs_quota.h" -#include "xfs_attr.h" -#include "xfs_reflink.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -46,8 +38,7 @@ int xrep_attempt( struct xfs_inode *ip, - struct xfs_scrub *sc, - bool *fixed) + struct xfs_scrub *sc) { int error = 0; @@ -66,13 +57,13 @@ xrep_attempt( * scrub so that we can tell userspace if we fixed the problem. */ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - *fixed = true; + sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ - if (!sc->try_harder) { - sc->try_harder = true; + if (!(sc->flags & XCHK_TRY_HARDER)) { + sc->flags |= XCHK_TRY_HARDER; return -EAGAIN; } /* @@ -137,10 +128,16 @@ xrep_roll_ag_trans( if (sc->sa.agfl_bp) xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); - /* Roll the transaction. */ + /* + * Roll the transaction. We still own the buffer and the buffer lock + * regardless of whether or not the roll succeeds. If the roll fails, + * the buffers will be released during teardown on our way out of the + * kernel. If it succeeds, we join them to the new transaction and + * move on. + */ error = xfs_trans_roll(&sc->tp); if (error) - goto out_release; + return error; /* Join AG headers to the new transaction. */ if (sc->sa.agi_bp) @@ -151,21 +148,6 @@ xrep_roll_ag_trans( xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; - -out_release: - /* - * Rolling failed, so release the hold on the buffers. 
The - * buffers will be released during teardown on our way out - * of the kernel. - */ - if (sc->sa.agi_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); - - return error; } /* @@ -367,7 +349,7 @@ xrep_init_btblock( bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0); + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(tp, bp, 0, bp->b_length); bp->b_ops = ops; @@ -682,7 +664,7 @@ xrep_findroot_agfl_walk( { xfs_agblock_t *agbno = priv; - return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0; + return (*agbno == bno) ? XFS_ITER_ABORT : 0; } /* Does this block match the btree information passed in? */ @@ -712,7 +694,7 @@ xrep_findroot_block( if (owner == XFS_RMAP_OWN_AG) { error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, xrep_findroot_agfl_walk, &agbno); - if (error == XFS_BTREE_QUERY_RANGE_ABORT) + if (error == XFS_ITER_ABORT) return 0; if (error) return error; @@ -743,7 +725,8 @@ xrep_findroot_block( /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); - if (be32_to_cpu(btblock->bb_magic) != fab->magic) + ASSERT(fab->buf_ops->magic[1] != 0); + if (btblock->bb_magic != fab->buf_ops->magic[1]) goto out; /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f2fc18bb7605..60c61d7052a8 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -15,7 +15,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ -int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc, bool *fixed); +int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, @@ -42,9 +42,6 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; - /* in: magic number of the btree */ - uint32_t magic; - /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; @@ -67,8 +64,7 @@ int xrep_agi(struct xfs_scrub *sc); static inline int xrep_attempt( struct xfs_inode *ip, - struct xfs_scrub *sc, - bool *fixed) + struct xfs_scrub *sc) { return -EOPNOTSUPP; } diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 92a140c5b55e..8d4cefd761c1 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -9,21 +9,12 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_btree.h" -#include "xfs_bit.h" -#include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" #include "xfs_rmap.h" #include "xfs_refcount.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "scrub/trace.h" /* * Set us up to scrub reverse mapping btrees. 
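[Editorial note] The xrep_roll_ag_trans() hunk above replaces the explicit bhold-release error path with a simpler ownership rule: the scrub context keeps the held, locked AG header buffers whether or not the transaction roll succeeds, and teardown releases them on failure. A minimal sketch of that hold/roll/rejoin pattern, reduced to a single buffer, follows; the helper name sketch_roll_held_buf() is hypothetical and not part of the patch.

static int
sketch_roll_held_buf(
	struct xfs_trans	**tpp,
	struct xfs_buf		*bp)
{
	int			error;

	/* Keep the buffer (and its lock) across the roll. */
	xfs_trans_bhold(*tpp, bp);

	/*
	 * If the roll fails we still own the locked buffer; teardown
	 * releases it on the way out, so no bhold_release is needed.
	 */
	error = xfs_trans_roll(tpp);
	if (error)
		return error;

	/* The roll succeeded: join the buffer to the new transaction. */
	xfs_trans_bjoin(*tpp, bp);
	return 0;
}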
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 665d4bbb17cc..c642bc206c41 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -9,19 +9,12 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" -#include "scrub/trace.h" /* Set us up with the realtime metadata locked. */ int @@ -141,9 +134,8 @@ xchk_xref_is_used_rt_space( startext = fsbno; endext = fsbno + len - 1; do_div(startext, sc->mp->m_sb.sb_rextsize); - if (do_div(endext, sc->mp->m_sb.sb_rextsize)) - endext++; - extcount = endext - startext; + do_div(endext, sc->mp->m_sb.sb_rextsize); + extcount = endext - startext + 1; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 1b2344d00525..15c8c5f3f688 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -9,37 +9,18 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_icache.h" -#include "xfs_itable.h" -#include "xfs_alloc.h" -#include "xfs_alloc_btree.h" -#include "xfs_bmap.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc.h" -#include "xfs_ialloc_btree.h" -#include "xfs_refcount.h" -#include "xfs_refcount_btree.h" -#include "xfs_rmap.h" -#include "xfs_rmap_btree.h" #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_errortag.h" #include "xfs_error.h" -#include "xfs_log.h" -#include "xfs_trans_priv.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/btree.h" #include "scrub/repair.h" +#include "scrub/health.h" /* * Online Scrub and Repair @@ -186,8 +167,12 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } - if (sc->has_quotaofflock) + if (sc->flags & XCHK_REAPING_DISABLED) + xchk_start_reaping(sc); + if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; + } if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -347,6 +332,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_quota, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ + .type = ST_FS, + .setup = xchk_setup_fscounters, + .scrub = xchk_fscounters, + .repair = xrep_notsupported, + }, }; /* This isn't a stable feature, warn once per day. */ @@ -466,10 +457,14 @@ xfs_scrub_metadata( struct xfs_inode *ip, struct xfs_scrub_metadata *sm) { - struct xfs_scrub sc; + struct xfs_scrub sc = { + .mp = ip->i_mount, + .sm = sm, + .sa = { + .agno = NULLAGNUMBER, + }, + }; struct xfs_mount *mp = ip->i_mount; - bool try_harder = false; - bool already_fixed = false; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -491,21 +486,17 @@ xfs_scrub_metadata( xchk_experimental_warning(mp); + sc.ops = &meta_scrub_ops[sm->sm_type]; + sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: /* Set up for the operation. 
*/ - memset(&sc, 0, sizeof(sc)); - sc.mp = ip->i_mount; - sc.sm = sm; - sc.ops = &meta_scrub_ops[sm->sm_type]; - sc.try_harder = try_harder; - sc.sa.agno = NULLAGNUMBER; error = sc.ops->setup(&sc, ip); if (error) goto out_teardown; /* Scrub for errors. */ error = sc.ops->scrub(&sc); - if (!try_harder && error == -EDEADLOCK) { + if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { /* * Scrubbers return -EDEADLOCK to mean 'try harder'. * Tear down everything we hold, then set up again with @@ -514,12 +505,15 @@ retry_op: error = xchk_teardown(&sc, ip, 0); if (error) goto out; - try_harder = true; + sc.flags |= XCHK_TRY_HARDER; goto retry_op; } else if (error) goto out_teardown; - if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { + xchk_update_health(&sc); + + if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc.flags & XREP_ALREADY_FIXED)) { bool needs_fix; /* Let debug users force us into the repair routines. */ @@ -542,10 +536,13 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(ip, &sc, &already_fixed); + error = xrep_attempt(ip, &sc); if (error == -EAGAIN) { - if (sc.try_harder) - try_harder = true; + /* + * Either the repair function succeeded or it couldn't + * get all the resources it needs; either way, we go + * back to the beginning and call the scrub function. + */ error = xchk_teardown(&sc, ip, 0); if (error) { xrep_failure(mp); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 22f754fba8e5..ad1ceb44a628 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -62,13 +62,27 @@ struct xfs_scrub { struct xfs_inode *ip; void *buf; uint ilock_flags; - bool try_harder; - bool has_quotaofflock; + + /* See the XCHK/XREP state flags below. */ + unsigned int flags; + + /* + * The XFS_SICK_* flags that correspond to the metadata being scrubbed + * or repaired. We will use this mask to update the in-core fs health + * status with whatever we find. + */ + unsigned int sick_mask; /* State tracking for single-AG operations. 
*/ struct xchk_ag sa; }; +/* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */ +#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ +#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ +#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -113,6 +127,7 @@ xchk_quota(struct xfs_scrub *sc) return -ENOENT; } #endif +int xchk_fscounters(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, @@ -138,4 +153,12 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif +struct xchk_fscounters { + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + unsigned long long icount_min; + unsigned long long icount_max; +}; + #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index f7ebaa946999..99c0b1234c3c 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -9,19 +9,11 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_btree.h" -#include "xfs_bit.h" #include "xfs_log_format.h" -#include "xfs_trans.h" -#include "xfs_sb.h" #include "xfs_inode.h" -#include "xfs_inode_fork.h" #include "xfs_symlink.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" -#include "scrub/trace.h" /* Set us up to scrub a symbolic link. */ int diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 96feaf8dcdec..9eaab2eb5ed3 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -10,15 +10,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_trans.h" -#include "xfs_bit.h" -#include "scrub/xfs_scrub.h" #include "scrub/scrub.h" -#include "scrub/common.h" /* Figure out which block the btree cursor was pointing to. 
*/ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 8344b14031ef..3362bae28b46 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -50,6 +50,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -75,7 +76,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \ { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ - { XFS_SCRUB_TYPE_PQUOTA, "prjquota" } + { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, @@ -223,6 +225,7 @@ DEFINE_EVENT(xchk_block_error_class, name, \ void *ret_ip), \ TP_ARGS(sc, daddr, ret_ip)) +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen); @@ -545,6 +548,109 @@ TRACE_EVENT(xchk_xref_error, __entry->ret_ip) ); +TRACE_EVENT(xchk_iallocbt_check_cluster, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, xfs_daddr_t map_daddr, + unsigned short map_len, unsigned int chunk_ino, + unsigned int nr_inodes, uint16_t cluster_mask, + uint16_t holemask, unsigned int cluster_ino), + TP_ARGS(mp, agno, startino, map_daddr, map_len, chunk_ino, nr_inodes, + cluster_mask, holemask, cluster_ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(xfs_daddr_t, map_daddr) + __field(unsigned short, map_len) + __field(unsigned int, chunk_ino) + __field(unsigned int, nr_inodes) + __field(unsigned int, cluster_ino) + __field(uint16_t, cluster_mask) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->map_daddr = map_daddr; + __entry->map_len = map_len; + __entry->chunk_ino = chunk_ino; + __entry->nr_inodes = nr_inodes; + __entry->cluster_mask = cluster_mask; + __entry->holemask = holemask; + __entry->cluster_ino = cluster_ino; + ), + TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->map_daddr, + __entry->map_len, + __entry->chunk_ino, + __entry->nr_inodes, + __entry->cluster_mask, + __entry->holemask, + __entry->cluster_ino) +) + +TRACE_EVENT(xchk_fscounters_calc, + TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, + uint64_t fdblocks, uint64_t delalloc), + TP_ARGS(mp, icount, ifree, fdblocks, delalloc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int64_t, icount_sb) + __field(uint64_t, icount_calculated) + __field(int64_t, ifree_sb) + __field(uint64_t, ifree_calculated) + __field(int64_t, fdblocks_sb) + __field(uint64_t, fdblocks_calculated) + __field(uint64_t, delalloc) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->icount_sb = mp->m_sb.sb_icount; + __entry->icount_calculated = icount; + __entry->ifree_sb = mp->m_sb.sb_ifree; + __entry->ifree_calculated = ifree; + __entry->fdblocks_sb = mp->m_sb.sb_fdblocks; + __entry->fdblocks_calculated = fdblocks; + __entry->delalloc = delalloc; + ), + TP_printk("dev %d:%d 
icount %lld:%llu ifree %lld:%llu fdblocks %lld:%llu delalloc %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount_sb, + __entry->icount_calculated, + __entry->ifree_sb, + __entry->ifree_calculated, + __entry->fdblocks_sb, + __entry->fdblocks_calculated, + __entry->delalloc) +) + +TRACE_EVENT(xchk_fscounters_within_range, + TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value, + int64_t old_value), + TP_ARGS(mp, expected, curr_value, old_value), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint64_t, expected) + __field(int64_t, curr_value) + __field(int64_t, old_value) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->expected = expected; + __entry->curr_value = curr_value; + __entry->old_value = old_value; + ), + TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->expected, + __entry->curr_value, + __entry->old_value) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 8039e35147dd..cbda40d40326 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -4,16 +4,14 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_trace.h" -#include <linux/slab.h> -#include <linux/xattr.h> #include <linux/posix_acl_xattr.h> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index d9048bcea49c..f16d5f196c6b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -12,23 +12,19 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" -#include "xfs_error.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" -#include <linux/writeback.h> /* * structure owned by writepages passed to individual writepage calls */ struct xfs_writepage_ctx { struct xfs_bmbt_irec imap; - unsigned int io_type; + int fork; + unsigned int data_seq; unsigned int cow_seq; struct xfs_ioend *ioend; }; @@ -62,7 +58,7 @@ xfs_find_daxdev_for_inode( static void xfs_finish_page_writeback( struct inode *inode, - struct bio_vec *bvec, + struct bio_vec *bvec, int error) { struct iomap_page *iop = to_iomap_page(bvec->bv_page); @@ -97,7 +93,7 @@ xfs_destroy_ioend( for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; - int i; + struct bvec_iter_all iter_all; /* * For the last bio, bi_private points to the ioend, so we @@ -109,7 +105,7 @@ xfs_destroy_ioend( next = bio->bi_private; /* walk each page on bio, ending page IO on them */ - bio_for_each_segment_all(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, iter_all) xfs_finish_page_writeback(inode, bvec, error); bio_put(bio); } @@ -137,8 +133,7 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, - XFS_TRANS_NOFS, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); if (error) return error; @@ -232,17 +227,24 @@ xfs_setfilesize_ioend( * IO write completion. 
*/ STATIC void -xfs_end_io( - struct work_struct *work) +xfs_end_ioend( + struct xfs_ioend *ioend) { - struct xfs_ioend *ioend = - container_of(work, struct xfs_ioend, io_work); + struct list_head ioend_list; struct xfs_inode *ip = XFS_I(ioend->io_inode); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; + unsigned int nofs_flag; int error; /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. + */ + nofs_flag = memalloc_nofs_save(); + + /* * Just clean up the in-memory structures if the fs has been shut down. */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { @@ -255,35 +257,140 @@ xfs_end_io( */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) xfs_reflink_cancel_cow_range(ip, offset, size, true); - break; - } - goto done; } /* - * Success: commit the COW or unwritten blocks if needed. + * Success: commit the COW or unwritten blocks if needed. */ - switch (ioend->io_type) { - case XFS_IO_COW: + if (ioend->io_fork == XFS_COW_FORK) error = xfs_reflink_end_cow(ip, offset, size); - break; - case XFS_IO_UNWRITTEN: - /* writeback should never update isize */ + else if (ioend->io_state == XFS_EXT_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - break; - default: + else ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); - break; - } done: if (ioend->io_append_trans) error = xfs_setfilesize_ioend(ioend, error); + list_replace_init(&ioend->io_list, &ioend_list); xfs_destroy_ioend(ioend, error); + + while (!list_empty(&ioend_list)) { + ioend = list_first_entry(&ioend_list, struct xfs_ioend, + io_list); + list_del_init(&ioend->io_list); + xfs_destroy_ioend(ioend, error); + } + + memalloc_nofs_restore(nofs_flag); +} + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool +xfs_ioend_can_merge( + struct xfs_ioend *ioend, + struct xfs_ioend *next) +{ + if (ioend->io_bio->bi_status != next->io_bio->bi_status) + return false; + if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK)) + return false; + if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^ + (next->io_state == XFS_EXT_UNWRITTEN)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + return true; +} + +/* + * If the to-be-merged ioend has a preallocated transaction for file + * size updates we need to ensure the ioend it is merged into also + * has one. If it already has one we can simply cancel the transaction + * as it is guaranteed to be clean. + */ +static void +xfs_ioend_merge_append_transactions( + struct xfs_ioend *ioend, + struct xfs_ioend *next) +{ + if (!ioend->io_append_trans) { + ioend->io_append_trans = next->io_append_trans; + next->io_append_trans = NULL; + } else { + xfs_setfilesize_ioend(next, -ECANCELED); + } +} + +/* Try to merge adjacent completions. 
*/ +STATIC void +xfs_ioend_try_merge( + struct xfs_ioend *ioend, + struct list_head *more_ioends) +{ + struct xfs_ioend *next_ioend; + + while (!list_empty(more_ioends)) { + next_ioend = list_first_entry(more_ioends, struct xfs_ioend, + io_list); + if (!xfs_ioend_can_merge(ioend, next_ioend)) + break; + list_move_tail(&next_ioend->io_list, &ioend->io_list); + ioend->io_size += next_ioend->io_size; + if (next_ioend->io_append_trans) + xfs_ioend_merge_append_transactions(ioend, next_ioend); + } +} + +/* list_sort compare function for ioends */ +static int +xfs_ioend_compare( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_ioend *ia; + struct xfs_ioend *ib; + + ia = container_of(a, struct xfs_ioend, io_list); + ib = container_of(b, struct xfs_ioend, io_list); + if (ia->io_offset < ib->io_offset) + return -1; + else if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +/* Finish all pending io completions. */ +void +xfs_end_io( + struct work_struct *work) +{ + struct xfs_inode *ip; + struct xfs_ioend *ioend; + struct list_head completion_list; + unsigned long flags; + + ip = container_of(work, struct xfs_inode, i_ioend_work); + + spin_lock_irqsave(&ip->i_ioend_lock, flags); + list_replace_init(&ip->i_ioend_list, &completion_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); + + list_sort(NULL, &completion_list, xfs_ioend_compare); + + while (!list_empty(&completion_list)) { + ioend = list_first_entry(&completion_list, struct xfs_ioend, + io_list); + list_del_init(&ioend->io_list); + xfs_ioend_try_merge(ioend, &completion_list); + xfs_end_ioend(ioend); + } } STATIC void @@ -291,16 +398,92 @@ xfs_end_bio( struct bio *bio) { struct xfs_ioend *ioend = bio->bi_private; - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - - if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + unsigned long flags; + + if (ioend->io_fork == XFS_COW_FORK || + ioend->io_state == XFS_EXT_UNWRITTEN || + ioend->io_append_trans != NULL) { + spin_lock_irqsave(&ip->i_ioend_lock, flags); + if (list_empty(&ip->i_ioend_list)) + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, + &ip->i_ioend_work)); + list_add_tail(&ioend->io_list, &ip->i_ioend_list); + spin_unlock_irqrestore(&ip->i_ioend_lock, flags); + } else xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); } +/* + * Fast revalidation of the cached writeback mapping. Return true if the current + * mapping is valid, false otherwise. + */ +static bool +xfs_imap_valid( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + if (offset_fsb < wpc->imap.br_startoff || + offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount) + return false; + /* + * If this is a COW mapping, it is sufficient to check that the mapping + * covers the offset. Be careful to check this first because the caller + * can revalidate a COW mapping without updating the data seqno. + */ + if (wpc->fork == XFS_COW_FORK) + return true; + + /* + * This is not a COW mapping. Check the sequence number of the data fork + * because concurrent changes could have invalidated the extent. Check + * the COW fork because concurrent changes since the last time we + * checked (and found nothing at this offset) could have added + * overlapping blocks. 
+ */ + if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq)) + return false; + if (xfs_inode_has_cow_data(ip) && + wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) + return false; + return true; +} + +/* + * Pass in a delalloc extent and convert it to real extents, and return the real + * extent that maps offset_fsb in wpc->imap. + * + * The current page is held locked so nothing could have removed the block + * backing offset_fsb, although it could have moved from the COW to the data + * fork by another thread. + */ +static int +xfs_convert_blocks( + struct xfs_writepage_ctx *wpc, + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb) +{ + int error; + + /* + * Attempt to allocate whatever delalloc extent currently backs + * offset_fsb and put the result into wpc->imap. Allocate in a loop + * because it may take several attempts to allocate real blocks for a + * contiguous delalloc extent if free space is sufficiently fragmented. + */ + do { + error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb, + &wpc->imap, wpc->fork == XFS_COW_FORK ? + &wpc->cow_seq : &wpc->data_seq); + if (error) + return error; + } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb); + + return 0; +} + STATIC int xfs_map_blocks( struct xfs_writepage_ctx *wpc, @@ -310,26 +493,16 @@ xfs_map_blocks( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t count = i_blocksize(inode); - xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb = NULLFILEOFF; struct xfs_bmbt_irec imap; - int whichfork = XFS_DATA_FORK; struct xfs_iext_cursor icur; - bool imap_valid; + int retries = 0; int error = 0; - /* - * We have to make sure the cached mapping is within EOF to protect - * against eofblocks trimming on file release leaving us with a stale - * mapping. Otherwise, a page for a subsequent file extending buffered - * write could get picked up by this writeback cycle and written to the - * wrong blocks. - * - * Note that what we really want here is a generic mapping invalidation - * mechanism to protect us from arbitrary extent modifying contexts, not - * just eofblocks. - */ - xfs_trim_extent_eof(&wpc->imap, ip); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; /* * COW fork blocks can overlap data fork blocks even if the blocks @@ -346,31 +519,19 @@ xfs_map_blocks( * against concurrent updates and provides a memory barrier on the way * out that ensures that we always see the current value. */ - imap_valid = offset_fsb >= wpc->imap.br_startoff && - offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount; - if (imap_valid && - (!xfs_inode_has_cow_data(ip) || - wpc->io_type == XFS_IO_COW || - wpc->cow_seq == READ_ONCE(ip->i_cowfp->if_seq))) + if (xfs_imap_valid(wpc, ip, offset_fsb)) return 0; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* * If we don't have a valid map, now it's time to get a new one for this * offset. This will convert delayed allocations (including COW ones) * into real extents. If we return without a valid map, it means we * landed in a hole and we skip the block. 
*/ +retry: xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); - ASSERT(offset <= mp->m_super->s_maxbytes); - - if (offset > mp->m_super->s_maxbytes - count) - count = mp->m_super->s_maxbytes - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); /* * Check if this offset is covered by a COW extent, and if yes use @@ -382,30 +543,16 @@ xfs_map_blocks( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); - /* - * Truncate can race with writeback since writeback doesn't - * take the iolock and truncate decreases the file size before - * it starts truncating the pages between new_size and old_size. - * Therefore, we can end up in the situation where writeback - * gets a CoW fork mapping but the truncate makes the mapping - * invalid and we end up in here trying to get a new mapping. - * bail out here so that we simply never get a valid mapping - * and so we drop the write altogether. The page truncation - * will kill the contents anyway. - */ - if (offset > i_size_read(inode)) { - wpc->io_type = XFS_IO_HOLE; - return 0; - } - whichfork = XFS_COW_FORK; - wpc->io_type = XFS_IO_COW; + + wpc->fork = XFS_COW_FORK; goto allocate_blocks; } /* - * Map valid and no COW extent in the way? We're done. + * No COW extent overlap. Revalidate now that we may have updated + * ->cow_seq. If the data mapping is still valid, we're done. */ - if (imap_valid) { + if (xfs_imap_valid(wpc, ip, offset_fsb)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return 0; } @@ -417,51 +564,65 @@ xfs_map_blocks( */ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) imap.br_startoff = end_fsb; /* fake a hole past EOF */ + wpc->data_seq = READ_ONCE(ip->i_df.if_seq); xfs_iunlock(ip, XFS_ILOCK_SHARED); + wpc->fork = XFS_DATA_FORK; + + /* landed in a hole or beyond EOF? */ if (imap.br_startoff > offset_fsb) { - /* landed in a hole or beyond EOF */ imap.br_blockcount = imap.br_startoff - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; - wpc->io_type = XFS_IO_HOLE; - } else { - /* - * Truncate to the next COW extent if there is one. This is the - * only opportunity to do this because we can skip COW fork - * lookups for the subsequent blocks in the mapping; however, - * the requirement to treat the COW range separately remains. - */ - if (cow_fsb != NULLFILEOFF && - cow_fsb < imap.br_startoff + imap.br_blockcount) - imap.br_blockcount = cow_fsb - imap.br_startoff; - - if (isnullstartblock(imap.br_startblock)) { - /* got a delalloc extent */ - wpc->io_type = XFS_IO_DELALLOC; - goto allocate_blocks; - } - - if (imap.br_state == XFS_EXT_UNWRITTEN) - wpc->io_type = XFS_IO_UNWRITTEN; - else - wpc->io_type = XFS_IO_OVERWRITE; + imap.br_state = XFS_EXT_NORM; } + /* + * Truncate to the next COW extent if there is one. This is the only + * opportunity to do this because we can skip COW fork lookups for the + * subsequent blocks in the mapping; however, the requirement to treat + * the COW range separately remains. + */ + if (cow_fsb != NULLFILEOFF && + cow_fsb < imap.br_startoff + imap.br_blockcount) + imap.br_blockcount = cow_fsb - imap.br_startoff; + + /* got a delalloc extent? 
*/ + if (imap.br_startblock != HOLESTARTBLOCK && + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); + trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap); return 0; allocate_blocks: - error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap, - &wpc->cow_seq); - if (error) + error = xfs_convert_blocks(wpc, ip, offset_fsb); + if (error) { + /* + * If we failed to find the extent in the COW fork we might have + * raced with a COW to data fork conversion or truncate. + * Restart the lookup to catch the extent in the data fork for + * the former case, but prevent additional retries to avoid + * looping forever for the latter case. + */ + if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++) + goto retry; + ASSERT(error != -EAGAIN); return error; - ASSERT(whichfork == XFS_COW_FORK || cow_fsb == NULLFILEOFF || - imap.br_startoff + imap.br_blockcount <= cow_fsb); - wpc->imap = imap; - xfs_trim_extent_eof(&wpc->imap, ip); - trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); + } + + /* + * Due to merging the returned real extent might be larger than the + * original delalloc one. Trim the returned extent to the next COW + * boundary again to force a re-lookup. + */ + if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF && + cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount) + wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff; + + ASSERT(wpc->imap.br_startoff <= offset_fsb); + ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb); + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap); return 0; } @@ -473,7 +634,7 @@ allocate_blocks: * reference to the ioend to ensure that the ioend completion is only done once * all bios have been submitted and the ioend is really done. * - * If @fail is non-zero, it means that we have a situation where some part of + * If @status is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked pages for writeback * and unlocked them. In this situation, we need to fail the bio and ioend * rather than submit it to IO. This typically only happens on a filesystem @@ -485,33 +646,33 @@ xfs_submit_ioend( struct xfs_ioend *ioend, int status) { - /* Convert CoW extents to regular */ - if (!status && ioend->io_type == XFS_IO_COW) { - /* - * Yuk. This can do memory allocation, but is not a - * transactional operation so everything is done in GFP_KERNEL - * context. That can deadlock, because we hold pages in - * writeback state and GFP_KERNEL allocations can block on them. - * Hence we must operate in nofs conditions here. - */ - unsigned nofs_flag; + unsigned int nofs_flag; - nofs_flag = memalloc_nofs_save(); + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. + */ + nofs_flag = memalloc_nofs_save(); + + /* Convert CoW extents to regular */ + if (!status && ioend->io_fork == XFS_COW_FORK) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); - memalloc_nofs_restore(nofs_flag); } /* Reserve log space if we might write beyond the on-disk inode size. 
*/ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && + (ioend->io_fork == XFS_COW_FORK || + ioend->io_state != XFS_EXT_UNWRITTEN) && xfs_ioend_is_append(ioend) && !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); + memalloc_nofs_restore(nofs_flag); + ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); /* * If we are failing the IO now, just mark the ioend with an @@ -525,7 +686,6 @@ xfs_submit_ioend( return status; } - ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; submit_bio(ioend->io_bio); return 0; } @@ -533,10 +693,12 @@ xfs_submit_ioend( static struct xfs_ioend * xfs_alloc_ioend( struct inode *inode, - unsigned int type, + int fork, + xfs_exntst_t state, xfs_off_t offset, struct block_device *bdev, - sector_t sector) + sector_t sector, + struct writeback_control *wbc) { struct xfs_ioend *ioend; struct bio *bio; @@ -544,14 +706,17 @@ xfs_alloc_ioend( bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset); bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = sector; + bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + bio->bi_write_hint = inode->i_write_hint; + wbc_init_bio(wbc, bio); ioend = container_of(bio, struct xfs_ioend, io_inline_bio); INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; + ioend->io_fork = fork; + ioend->io_state = state; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; - INIT_WORK(&ioend->io_work, xfs_end_io); ioend->io_append_trans = NULL; ioend->io_bio = bio; return ioend; @@ -564,24 +729,22 @@ xfs_alloc_ioend( * so that the bi_private linkage is set up in the right direction for the * traversal in xfs_destroy_ioend(). */ -static void +static struct bio * xfs_chain_bio( - struct xfs_ioend *ioend, - struct writeback_control *wbc, - struct block_device *bdev, - sector_t sector) + struct bio *prev) { struct bio *new; new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); - bio_set_dev(new, bdev); - new->bi_iter.bi_sector = sector; - bio_chain(ioend->io_bio, new); - bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint; - submit_bio(ioend->io_bio); - ioend->io_bio = new; + bio_copy_dev(new, prev);/* also copies over blkcg information */ + new->bi_iter.bi_sector = bio_end_sector(prev); + new->bi_opf = prev->bi_opf; + new->bi_write_hint = prev->bi_write_hint; + + bio_chain(prev, new); + bio_get(prev); /* for xfs_destroy_ioend */ + submit_bio(prev); + return new; } /* @@ -603,29 +766,37 @@ xfs_add_to_ioend( struct block_device *bdev = xfs_find_bdev_for_inode(inode); unsigned len = i_blocksize(inode); unsigned poff = offset & (PAGE_SIZE - 1); + bool merged, same_page = false; sector_t sector; sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) + ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9); - if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + if (!wpc->ioend || + wpc->fork != wpc->ioend->io_fork || + wpc->imap.br_state != wpc->ioend->io_state || sector != bio_end_sector(wpc->ioend->io_bio) || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, - bdev, sector); + wpc->ioend = xfs_alloc_ioend(inode, wpc->fork, + wpc->imap.br_state, offset, bdev, sector, wbc); } - if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { - if (iop) - 
atomic_inc(&iop->write_count); - if (bio_full(wpc->ioend->io_bio)) - xfs_chain_bio(wpc->ioend, wbc, bdev, sector); - __bio_add_page(wpc->ioend->io_bio, page, len, poff); + merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, + &same_page); + + if (iop && !same_page) + atomic_inc(&iop->write_count); + + if (!merged) { + if (bio_full(wpc->ioend->io_bio, len)) + wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio); + bio_add_page(wpc->ioend->io_bio, page, len, poff); } wpc->ioend->io_size += len; + wbc_account_cgroup_owner(wbc, page, len); } STATIC void @@ -723,7 +894,7 @@ xfs_writepage_map( error = xfs_map_blocks(wpc, inode, file_offset); if (error) break; - if (wpc->io_type == XFS_IO_HOLE) + if (wpc->imap.br_startblock == HOLESTARTBLOCK) continue; xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc, &submit_list); @@ -918,9 +1089,7 @@ xfs_vm_writepage( struct page *page, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; ret = xfs_do_writepage(page, wbc, &wpc); @@ -934,9 +1103,7 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_HOLE, - }; + struct xfs_writepage_ctx wpc = { }; int ret; xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); @@ -983,7 +1150,7 @@ xfs_vm_bmap( * Since we don't pass back blockdev info, we can't return bmap * information for rt files either. */ - if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) + if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; return iomap_bmap(mapping, block, &xfs_iomap_ops); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e5c23948a8ab..45a1ea240cbb 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,36 +9,15 @@ extern struct bio_set xfs_ioend_bioset; /* - * Types of I/O for bmap clustering and I/O completion tracking. - * - * This enum is used in string mapping in xfs_trace.h; please keep the - * TRACE_DEFINE_ENUMs for it up to date. - */ -enum { - XFS_IO_HOLE, /* covers region without any block allocation */ - XFS_IO_DELALLOC, /* covers delalloc region */ - XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ - XFS_IO_OVERWRITE, /* covers already allocated extent */ - XFS_IO_COW, /* covers copy-on-write extent */ -}; - -#define XFS_IO_TYPES \ - { XFS_IO_HOLE, "hole" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" } - -/* * Structure for buffered I/O completions. */ struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ - unsigned int io_type; /* delalloc / unwritten */ + int io_fork; /* inode fork written back */ + xfs_exntst_t io_state; /* extent state */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ - struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ struct bio *io_bio; /* bio being built */ struct bio io_inline_bio; /* MUST BE LAST! 
*/ @@ -49,7 +28,6 @@ extern const struct address_space_operations xfs_dax_aops; int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); -extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 228821b2ebe0..dc93c51c17de 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -15,18 +15,13 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_inode.h" -#include "xfs_alloc.h" #include "xfs_attr_remote.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" -#include "xfs_error.h" #include "xfs_quota.h" -#include "xfs_trace.h" #include "xfs_dir2.h" -#include "xfs_defer.h" /* * Look at all the extents for this logical region, @@ -121,7 +116,7 @@ xfs_attr3_leaf_inactive( int size; int tmp; int i; - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; leaf = bp->b_addr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a58034049995..58fc820a70c6 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -6,25 +6,20 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" #include "xfs_mount.h" #include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_sf.h" -#include "xfs_attr_remote.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_buf_item.h" -#include "xfs_cksum.h" #include "xfs_dir2.h" STATIC int @@ -555,6 +550,7 @@ xfs_attr_put_listent( attrlist_ent_t *aep; int arraytop; + ASSERT(!context->seen_enough); ASSERT(!(context->flags & ATTR_KERNOVAL)); ASSERT(context->count >= 0); ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c new file mode 100644 index 000000000000..e2148f2d5d6b --- /dev/null +++ b/fs/xfs/xfs_bio_io.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Christoph Hellwig. 
+ */ +#include "xfs.h" + +static inline unsigned int bio_max_vecs(unsigned int count) +{ + return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); +} + +int +xfs_rw_bdev( + struct block_device *bdev, + sector_t sector, + unsigned int count, + char *data, + unsigned int op) + +{ + unsigned int is_vmalloc = is_vmalloc_addr(data); + unsigned int left = count; + int error; + struct bio *bio; + + if (is_vmalloc && op == REQ_OP_WRITE) + flush_kernel_vmap_range(data, count); + + bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = sector; + bio->bi_opf = op | REQ_META | REQ_SYNC; + + do { + struct page *page = kmem_to_page(data); + unsigned int off = offset_in_page(data); + unsigned int len = min_t(unsigned, left, PAGE_SIZE - off); + + while (bio_add_page(bio, page, len, off) != len) { + struct bio *prev = bio; + + bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left)); + bio_copy_dev(bio, prev); + bio->bi_iter.bi_sector = bio_end_sector(prev); + bio->bi_opf = prev->bi_opf; + bio_chain(prev, bio); + + submit_bio(prev); + } + + data += len; + left -= len; + } while (left > 0); + + error = submit_bio_wait(bio); + bio_put(bio); + + if (is_vmalloc && op == REQ_OP_READ) + invalidate_kernel_vmap_range(data, count); + return error; +} diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ce45f066995e..9fa4a7ee8cfc 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -9,17 +9,16 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_buf_item.h" #include "xfs_bmap_item.h" #include "xfs_log.h" #include "xfs_bmap.h" #include "xfs_icache.h" -#include "xfs_trace.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" @@ -96,15 +95,6 @@ xfs_bui_item_format( } /* - * Pinning has no meaning for a bui item, so just return. - */ -STATIC void -xfs_bui_item_pin( - struct xfs_log_item *lip) -{ -} - -/* * The unpin operation is the last place a BUI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In * either case, the BUI transaction has been successfully committed to make it @@ -123,71 +113,22 @@ xfs_bui_item_unpin( } /* - * BUI items have no locking or pushing. However, since BUIs are pulled from - * the AIL when their corresponding BUDs are committed to disk, their situation - * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller - * will eventually flush the log. This should help in getting the BUI out of - * the AIL. - */ -STATIC uint -xfs_bui_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_PINNED; -} - -/* * The BUI has been either committed or aborted if the transaction has been * cancelled. If the transaction was cancelled, a BUD isn't going to be * constructed and thus we free the BUI here directly. */ STATIC void -xfs_bui_item_unlock( +xfs_bui_item_release( struct xfs_log_item *lip) { - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) - xfs_bui_release(BUI_ITEM(lip)); -} - -/* - * The BUI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. - */ -STATIC xfs_lsn_t -xfs_bui_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - return lsn; + xfs_bui_release(BUI_ITEM(lip)); } -/* - * The BUI dependency tracking op doesn't do squat. 
It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ -STATIC void -xfs_bui_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ -} - -/* - * This is the ops vector shared by all bui log items. - */ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, - .iop_pin = xfs_bui_item_pin, .iop_unpin = xfs_bui_item_unpin, - .iop_unlock = xfs_bui_item_unlock, - .iop_committed = xfs_bui_item_committed, - .iop_push = xfs_bui_item_push, - .iop_committing = xfs_bui_item_committing, + .iop_release = xfs_bui_item_release, }; /* @@ -249,126 +190,241 @@ xfs_bud_item_format( } /* - * Pinning has no meaning for a bud item, so just return. + * The BUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the BUI and free the + * BUD. */ STATIC void -xfs_bud_item_pin( +xfs_bud_item_release( struct xfs_log_item *lip) { + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); } -/* - * Since pinning has no meaning for a bud item, unpinning does - * not either. - */ -STATIC void -xfs_bud_item_unpin( - struct xfs_log_item *lip, - int remove) +static const struct xfs_item_ops xfs_bud_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .iop_size = xfs_bud_item_size, + .iop_format = xfs_bud_item_format, + .iop_release = xfs_bud_item_release, +}; + +static struct xfs_bud_log_item * +xfs_trans_get_bud( + struct xfs_trans *tp, + struct xfs_bui_log_item *buip) { + struct xfs_bud_log_item *budp; + + budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + xfs_trans_add_item(tp, &budp->bud_item); + return budp; } /* - * There isn't much you can do to push on a bud item. It is simply stuck - * waiting for the log to be flushed to disk. + * Finish a bmap update and log it to the BUD. Note that the + * transaction is marked dirty regardless of whether the bmap update + * succeeds or fails, to support the BUI/BUD lifecycle rules. */ -STATIC uint -xfs_bud_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) +static int +xfs_trans_log_finish_bmap_update( + struct xfs_trans *tp, + struct xfs_bud_log_item *budp, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t *blockcount, + xfs_exntst_t state) { - return XFS_ITEM_PINNED; + int error; + + error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, + startblock, blockcount, state); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the BUI and frees the BUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + return error; } -/* - * The BUD is either committed or aborted if the transaction is cancelled. If - * the transaction is cancelled, drop our reference to the BUI and free the - * BUD. - */ -STATIC void -xfs_bud_item_unlock( - struct xfs_log_item *lip) +/* Sort bmap intents by inode. 
*/ +static int +xfs_bmap_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) { - struct xfs_bud_log_item *budp = BUD_ITEM(lip); + struct xfs_bmap_intent *ba; + struct xfs_bmap_intent *bb; - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { - xfs_bui_release(budp->bud_buip); - kmem_zone_free(xfs_bud_zone, budp); - } + ba = container_of(a, struct xfs_bmap_intent, bi_list); + bb = container_of(b, struct xfs_bmap_intent, bi_list); + return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* - * When the bud item is committed to disk, all we need to do is delete our - * reference to our partner bui item and then free ourselves. Since we're - * freeing ourselves we must return -1 to keep the transaction code from - * further referencing this item. - */ -STATIC xfs_lsn_t -xfs_bud_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +/* Get a BUI. */ +STATIC void * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + unsigned int count) { - struct xfs_bud_log_item *budp = BUD_ITEM(lip); + struct xfs_bui_log_item *buip; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + ASSERT(tp != NULL); + + buip = xfs_bui_init(tp->t_mountp); + ASSERT(buip != NULL); /* - * Drop the BUI reference regardless of whether the BUD has been - * aborted. Once the BUD transaction is constructed, it is the sole - * responsibility of the BUD to release the BUI (even if the BUI is - * aborted due to log I/O error). + * Get a log_item_desc to point at the new item. */ - xfs_bui_release(budp->bud_buip); - kmem_zone_free(xfs_bud_zone, budp); + xfs_trans_add_item(tp, &buip->bui_item); + return buip; +} - return (xfs_lsn_t)-1; +/* Set the map extent flags for this mapping. */ +static void +xfs_trans_set_bmap_flags( + struct xfs_map_extent *bmap, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_exntst_t state) +{ + bmap->me_flags = 0; + switch (type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + bmap->me_flags = type; + break; + default: + ASSERT(0); + } + if (state == XFS_EXT_UNWRITTEN) + bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) + bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; } -/* - * The BUD dependency tracking op doesn't do squat. It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ +/* Log bmap updates in the intent item. */ STATIC void -xfs_bud_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +xfs_bmap_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) { + struct xfs_bui_log_item *buip = intent; + struct xfs_bmap_intent *bmap; + uint next_extent; + struct xfs_map_extent *map; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. 
+ */ + next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; + ASSERT(next_extent < buip->bui_format.bui_nextents); + map = &buip->bui_format.bui_extents[next_extent]; + map->me_owner = bmap->bi_owner->i_ino; + map->me_startblock = bmap->bi_bmap.br_startblock; + map->me_startoff = bmap->bi_bmap.br_startoff; + map->me_len = bmap->bi_bmap.br_blockcount; + xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, + bmap->bi_bmap.br_state); } -/* - * This is the ops vector shared by all bud log items. - */ -static const struct xfs_item_ops xfs_bud_item_ops = { - .iop_size = xfs_bud_item_size, - .iop_format = xfs_bud_item_format, - .iop_pin = xfs_bud_item_pin, - .iop_unpin = xfs_bud_item_unpin, - .iop_unlock = xfs_bud_item_unlock, - .iop_committed = xfs_bud_item_committed, - .iop_push = xfs_bud_item_push, - .iop_committing = xfs_bud_item_committing, -}; +/* Get a BUD so we can process all the deferred bmap updates. */ +STATIC void * +xfs_bmap_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_bud(tp, intent); +} -/* - * Allocate and initialize a bud item with the given number of extents. - */ -struct xfs_bud_log_item * -xfs_bud_init( - struct xfs_mount *mp, - struct xfs_bui_log_item *buip) +/* Process a deferred bmap update. */ +STATIC int +xfs_bmap_update_finish_item( + struct xfs_trans *tp, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_bmap_intent *bmap; + xfs_filblks_t count; + int error; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + count = bmap->bi_bmap.br_blockcount; + error = xfs_trans_log_finish_bmap_update(tp, done_item, + bmap->bi_type, + bmap->bi_owner, bmap->bi_whichfork, + bmap->bi_bmap.br_startoff, + bmap->bi_bmap.br_startblock, + &count, + bmap->bi_bmap.br_state); + if (!error && count > 0) { + ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); + bmap->bi_bmap.br_blockcount = count; + return -EAGAIN; + } + kmem_free(bmap); + return error; +} +/* Abort all pending BUIs. */ +STATIC void +xfs_bmap_update_abort_intent( + void *intent) { - struct xfs_bud_log_item *budp; + xfs_bui_release(intent); +} - budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); - xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; +/* Cancel a deferred bmap update. */ +STATIC void +xfs_bmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_bmap_intent *bmap; - return budp; + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + kmem_free(bmap); } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .diff_items = xfs_bmap_update_diff_items, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .log_item = xfs_bmap_update_log_item, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, +}; + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. 
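[Editorial note] The xfs_bmap_update_defer_type vector above is not called directly; a caller queues an intent and the deferred-operations machinery drives ->create_intent, ->log_item, ->create_done and ->finish_item in order, producing the BUI/BUD pair. A minimal sketch of queueing one unmap intent, modeled on __xfs_bmap_add() in libxfs, follows; the helper name sketch_defer_bmap_unmap() is hypothetical and not part of the patch.

static void
sketch_defer_bmap_unmap(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec)
{
	struct xfs_bmap_intent	*bi;

	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
	INIT_LIST_HEAD(&bi->bi_list);
	bi->bi_type = XFS_BMAP_UNMAP;
	bi->bi_owner = ip;
	bi->bi_whichfork = XFS_DATA_FORK;
	bi->bi_bmap = *irec;

	/* Queue the intent; the BUI is logged and later finished via the BUD. */
	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
}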
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 89e043a88bb8..ad479cc73de8 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -75,8 +75,6 @@ extern struct kmem_zone *xfs_bui_zone; extern struct kmem_zone *xfs_bud_zone; struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); -struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, - struct xfs_bui_log_item *); void xfs_bui_item_free(struct xfs_bui_log_item *); void xfs_bui_release(struct xfs_bui_log_item *); int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1ee8c5539fa4..98c6a7a71427 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -12,12 +12,10 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_da_format.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" -#include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -28,11 +26,8 @@ #include "xfs_trans_space.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_log.h" -#include "xfs_rmap_btree.h" #include "xfs_iomap.h" #include "xfs_reflink.h" -#include "xfs_refcount.h" /* Kernel only BMAP related definitions and functions */ @@ -276,7 +271,7 @@ xfs_bmap_count_tree( struct xfs_btree_block *block, *nextblock; int numrecs; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) return error; @@ -287,7 +282,7 @@ xfs_bmap_count_tree( /* Not at node above leaves, count this level of nodes */ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); while (nextbno != NULLFSBLOCK) { - error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + error = xfs_btree_read_bufl(mp, tp, nextbno, &nbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) @@ -321,7 +316,7 @@ xfs_bmap_count_tree( if (nextbno == NULLFSBLOCK) break; bno = nextbno; - error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + error = xfs_btree_read_bufl(mp, tp, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); if (error) @@ -1162,16 +1157,13 @@ xfs_zero_file_space( * by virtue of the hole punch. */ error = xfs_free_file_space(ip, offset, len); - if (error) - goto out; + if (error || xfs_is_always_cow_inode(ip)) + return error; - error = xfs_alloc_file_space(ip, round_down(offset, blksize), + return xfs_alloc_file_space(ip, round_down(offset, blksize), round_up(offset + len, blksize) - round_down(offset, blksize), XFS_BMAPI_PREALLOC); -out: - return error; - } static int @@ -1196,6 +1188,8 @@ xfs_prepare_shift( * about to shift down every extent from offset to EOF. */ error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip)); + if (error) + return error; /* * Clean out anything hanging around in the cow fork now that diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4f5f2ff3f70f..ca0849043f54 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -4,24 +4,9 @@ * All Rights Reserved. 
*/ #include "xfs.h" -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/gfp.h> -#include <linux/pagemap.h> -#include <linux/init.h> -#include <linux/vmalloc.h> -#include <linux/bio.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/workqueue.h> -#include <linux/percpu.h> -#include <linux/blkdev.h> -#include <linux/hash.h> -#include <linux/kthread.h> -#include <linux/migrate.h> #include <linux/backing-dev.h> -#include <linux/freezer.h> +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -213,7 +198,7 @@ xfs_buf_free_maps( } } -struct xfs_buf * +static struct xfs_buf * _xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, @@ -243,6 +228,7 @@ _xfs_buf_alloc( sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); bp->b_target = target; + bp->b_mount = target->bt_mount; bp->b_flags = flags; /* @@ -263,12 +249,11 @@ _xfs_buf_alloc( bp->b_maps[i].bm_len = map[i].bm_len; bp->b_length += map[i].bm_len; } - bp->b_io_length = bp->b_length; atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); - XFS_STATS_INC(target->bt_mount, xb_create); + XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); return bp; @@ -425,12 +410,12 @@ retry: current->comm, current->pid, __func__, gfp_mask); - XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries); + XFS_STATS_INC(bp->b_mount, xb_page_retries); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found); + XFS_STATS_INC(bp->b_mount, xb_page_found); nbytes = min_t(size_t, size, PAGE_SIZE - offset); size -= nbytes; @@ -776,29 +761,24 @@ _xfs_buf_read( } /* - * Set buffer ops on an unchecked buffer and validate it, if possible. - * - * If the caller passed in an ops structure and the buffer doesn't have ops - * assigned, set the ops and use them to verify the contents. If the contents - * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no - * recorded errors and is already in XBF_DONE state. + * Reverify a buffer found in cache without an attached ->b_ops. * - * Under normal operations, every in-core buffer must have buffer ops assigned - * to them when the buffer is read in from disk so that we can validate the - * metadata. + * If the caller passed an ops structure and the buffer doesn't have ops + * assigned, set the ops and use it to verify the contents. If verification + * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is + * already in XBF_DONE state on entry. * - * However, there are two scenarios where one can encounter in-core buffers - * that don't have buffer ops. The first is during log recovery of buffers on - * a V4 filesystem, though these buffers are purged at the end of recovery. - * - * The other is online repair, which tries to match arbitrary metadata blocks - * with btree types in order to find the root. If online repair doesn't match - * the buffer with /any/ btree type, the buffer remains in memory in DONE state - * with no ops, and a subsequent read_buf call from elsewhere will not set the - * ops. This function helps us fix this situation. + * Under normal operations, every in-core buffer is verified on read I/O + * completion. There are two scenarios that can lead to in-core buffers without + * an assigned ->b_ops. The first is during log recovery of buffers on a V4 + * filesystem, though these buffers are purged at the end of recovery. 
The + * other is online repair, which intentionally reads with a NULL buffer ops to + * run several verifiers across an in-core buffer in order to establish buffer + * type. If repair can't establish that, the buffer will be left in memory + * with NULL buffer ops. */ int -xfs_buf_ensure_ops( +xfs_buf_reverify( struct xfs_buf *bp, const struct xfs_buf_ops *ops) { @@ -840,7 +820,7 @@ xfs_buf_read_map( return bp; } - xfs_buf_ensure_ops(bp, ops); + xfs_buf_reverify(bp, ops); if (flags & XBF_ASYNC) { /* @@ -914,83 +894,6 @@ xfs_buf_read_uncached( return 0; } -/* - * Return a buffer allocated as an empty buffer and associated to external - * memory via xfs_buf_associate_memory() back to it's empty state. - */ -void -xfs_buf_set_empty( - struct xfs_buf *bp, - size_t numblks) -{ - if (bp->b_pages) - _xfs_buf_free_pages(bp); - - bp->b_pages = NULL; - bp->b_page_count = 0; - bp->b_addr = NULL; - bp->b_length = numblks; - bp->b_io_length = numblks; - - ASSERT(bp->b_map_count == 1); - bp->b_bn = XFS_BUF_DADDR_NULL; - bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; - bp->b_maps[0].bm_len = bp->b_length; -} - -static inline struct page * -mem_to_page( - void *addr) -{ - if ((!is_vmalloc_addr(addr))) { - return virt_to_page(addr); - } else { - return vmalloc_to_page(addr); - } -} - -int -xfs_buf_associate_memory( - xfs_buf_t *bp, - void *mem, - size_t len) -{ - int rval; - int i = 0; - unsigned long pageaddr; - unsigned long offset; - size_t buflen; - int page_count; - - pageaddr = (unsigned long)mem & PAGE_MASK; - offset = (unsigned long)mem - pageaddr; - buflen = PAGE_ALIGN(len + offset); - page_count = buflen >> PAGE_SHIFT; - - /* Free any previous set of page pointers */ - if (bp->b_pages) - _xfs_buf_free_pages(bp); - - bp->b_pages = NULL; - bp->b_addr = mem; - - rval = _xfs_buf_get_pages(bp, page_count); - if (rval) - return rval; - - bp->b_offset = offset; - - for (i = 0; i < bp->b_page_count; i++) { - bp->b_pages[i] = mem_to_page((void *)pageaddr); - pageaddr += PAGE_SIZE; - } - - bp->b_io_length = BTOBB(len); - bp->b_length = BTOBB(buflen); - - return 0; -} - xfs_buf_t * xfs_buf_get_uncached( struct xfs_buftarg *target, @@ -1185,7 +1088,7 @@ xfs_buf_lock( trace_xfs_buf_lock(bp, _RET_IP_); if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_target->bt_mount, 0); + xfs_log_force(bp->b_mount, 0); down(&bp->b_sema); trace_xfs_buf_lock_done(bp, _RET_IP_); @@ -1274,7 +1177,7 @@ xfs_buf_ioend_async( struct xfs_buf *bp) { INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); - queue_work(bp->b_ioend_wq, &bp->b_ioend_work); + queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); } void @@ -1293,7 +1196,7 @@ xfs_buf_ioerror_alert( struct xfs_buf *bp, const char *func) { - xfs_alert(bp->b_target->bt_mount, + xfs_alert(bp->b_mount, "metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, -bp->b_error); @@ -1312,10 +1215,8 @@ xfs_bwrite( XBF_WRITE_FAIL | XBF_DONE); error = xfs_buf_submit(bp); - if (error) { - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); - } + if (error) + xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); return error; } @@ -1441,21 +1342,8 @@ _xfs_buf_ioapply( */ bp->b_error = 0; - /* - * Initialize the I/O completion workqueue if we haven't yet or the - * submitter has not opted to specify a custom one. 
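
For readers following the xfs_buf_ensure_ops to xfs_buf_reverify rename in this hunk: the helper lazily attaches verifier ops to a cached buffer that was first read without them, and demotes the buffer if verification fails. A minimal sketch of that control flow, assuming hypothetical buf/buf_ops types and a DONE bit standing in for XBF_DONE:

#include <stdbool.h>
#include <stddef.h>

#define BUF_DONE	(1u << 0)	/* contents believed valid */

struct buf_ops {
	bool	(*verify)(const void *data, size_t len);
};

struct buf {
	unsigned int		flags;
	const struct buf_ops	*ops;
	void			*data;
	size_t			len;
};

/* Attach ops to an unchecked cached buffer and validate it once. */
static void buf_reverify(struct buf *bp, const struct buf_ops *ops)
{
	if (bp->ops || !ops)
		return;		/* already verified, or nothing to check */

	bp->ops = ops;
	if (!ops->verify(bp->data, bp->len))
		bp->flags &= ~BUF_DONE;	/* caller sees a failed read */
}

static bool always_ok(const void *data, size_t len)
{
	(void)data; (void)len;
	return true;
}

int main(void)
{
	static const struct buf_ops ok_ops = { .verify = always_ok };
	char payload[8] = { 0 };
	struct buf bp = { .flags = BUF_DONE, .data = payload,
			  .len = sizeof(payload) };

	buf_reverify(&bp, &ok_ops);
	return (bp.flags & BUF_DONE) ? 0 : 1;
}

In the kernel the verifier records an error code and failure address on the buffer instead of returning a bool, but the control flow is the same.
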
- */ - if (!bp->b_ioend_wq) - bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; - if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; - if (bp->b_flags & XBF_SYNCIO) - op_flags = REQ_SYNC; - if (bp->b_flags & XBF_FUA) - op_flags |= REQ_FUA; - if (bp->b_flags & XBF_FLUSH) - op_flags |= REQ_PREFLUSH; /* * Run the write verifier callback function if it exists. If @@ -1465,12 +1353,12 @@ _xfs_buf_ioapply( if (bp->b_ops) { bp->b_ops->verify_write(bp); if (bp->b_error) { - xfs_force_shutdown(bp->b_target->bt_mount, + xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); return; } } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; /* * non-crc filesystems don't attach verifiers during @@ -1502,7 +1390,7 @@ _xfs_buf_ioapply( * subsequent call. */ offset = bp->b_offset; - size = BBTOB(bp->b_io_length); + size = BBTOB(bp->b_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); @@ -1548,7 +1436,7 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); /* on shutdown we stale and complete the buffer immediately */ - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); @@ -1618,16 +1506,11 @@ xfs_buf_offset( return page_address(page) + (offset & (PAGE_SIZE-1)); } -/* - * Move data into or out of a buffer. - */ void -xfs_buf_iomove( - xfs_buf_t *bp, /* buffer to process */ - size_t boff, /* starting buffer offset */ - size_t bsize, /* length to copy */ - void *data, /* data address */ - xfs_buf_rw_t mode) /* read/write/zero flag */ +xfs_buf_zero( + struct xfs_buf *bp, + size_t boff, + size_t bsize) { size_t bend; @@ -1640,23 +1523,13 @@ xfs_buf_iomove( page_offset = (boff + bp->b_offset) & ~PAGE_MASK; page = bp->b_pages[page_index]; csize = min_t(size_t, PAGE_SIZE - page_offset, - BBTOB(bp->b_io_length) - boff); + BBTOB(bp->b_length) - boff); ASSERT((csize + page_offset) <= PAGE_SIZE); - switch (mode) { - case XBRW_ZERO: - memset(page_address(page) + page_offset, 0, csize); - break; - case XBRW_READ: - memcpy(data, page_address(page) + page_offset, csize); - break; - case XBRW_WRITE: - memcpy(page_address(page) + page_offset, data, csize); - } + memset(page_address(page) + page_offset, 0, csize); boff += csize; - data += csize; } } @@ -2203,9 +2076,45 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. */ - if (XFS_TEST_ERROR(false, bp->b_target->bt_mount, - XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); } + +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. + */ +bool +xfs_verify_magic( + struct xfs_buf *bp, + __be32 dmagic) +{ + struct xfs_mount *mp = bp->b_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + return false; + return dmagic == bp->b_ops->magic[idx]; +} +/* + * Verify an on-disk magic value against the magic value specified in the + * verifier structure. The verifier magic is in disk byte order so the caller is + * expected to pass the value directly from disk. 
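
The new xfs_verify_magic/xfs_verify_magic16 helpers pick the expected magic out of a two-entry table indexed by whether the superblock has CRCs, and compare in disk byte order so no byteswap of the disk value is ever needed. A compilable user-space analogue, with htonl() standing in for cpu_to_be32(); the "ABTB"/"AB3B" values mirror the allocation-btree magics but are used here purely for illustration:

#include <arpa/inet.h>	/* htonl() as a stand-in for cpu_to_be32() */
#include <stdbool.h>
#include <stdint.h>

struct demo_ops {
	uint32_t	magic_be[2];	/* [0] = v4 magic, [1] = v5 magic */
};

/*
 * Compare an on-disk (big-endian) magic against the expected value for
 * this filesystem version. Both sides stay in disk byte order.
 */
static bool demo_verify_magic(const struct demo_ops *ops, uint32_t dmagic_be,
			      bool has_crc)
{
	return dmagic_be == ops->magic_be[has_crc ? 1 : 0];
}

int main(void)
{
	struct demo_ops bnobt_ops = {
		/* "ABTB" for v4 and "AB3B" for v5, as an example pair */
		.magic_be = { htonl(0x41425442), htonl(0x41423342) },
	};
	uint32_t disk_magic = htonl(0x41423342);	/* pretend disk read */

	return demo_verify_magic(&bnobt_ops, disk_magic, true) ? 0 : 1;
}
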
+ */ +bool +xfs_verify_magic16( + struct xfs_buf *bp, + __be16 dmagic) +{ + struct xfs_mount *mp = bp->b_mount; + int idx; + + idx = xfs_sb_version_hascrc(&mp->m_sb); + if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + return false; + return dmagic == bp->b_ops->magic16[idx]; +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9f5511ea998..c6e57a3f409e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -21,12 +21,6 @@ #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) -typedef enum { - XBRW_READ = 1, /* transfer into target memory */ - XBRW_WRITE = 2, /* transfer from target memory */ - XBRW_ZERO = 3, /* Zero target memory */ -} xfs_buf_rw_t; - #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ @@ -34,12 +28,7 @@ typedef enum { #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ -#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ - -/* I/O hints for the BIO layer */ -#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ -#define XBF_FUA (1 << 11)/* force cache write through mode */ -#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ +#define XBF_WRITE_FAIL (1 << 7) /* async writes have failed on this buffer */ /* flags used only as arguments to access routines */ #define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ @@ -49,7 +38,6 @@ typedef enum { #define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ -#define _XBF_COMPOUND (1 << 23)/* compound buffer */ typedef unsigned int xfs_buf_flags_t; @@ -62,15 +50,11 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ - { XBF_SYNCIO, "SYNCIO" }, \ - { XBF_FUA, "FUA" }, \ - { XBF_FLUSH, "FLUSH" }, \ { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_DELWRI_Q, "DELWRI_Q" } /* @@ -125,6 +109,10 @@ struct xfs_buf_map { struct xfs_buf_ops { char *name; + union { + __be32 magic[2]; /* v4 and v5 on disk magic values */ + __be16 magic16[2]; /* v4 and v5 on disk magic values */ + }; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); xfs_failaddr_t (*verify_struct)(struct xfs_buf *bp); @@ -157,13 +145,13 @@ typedef struct xfs_buf { wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ + struct xfs_mount *b_mount; xfs_buftarg_t *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ struct work_struct b_ioend_work; - struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ - void *b_log_item; + struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; struct page **b_pages; /* array of page pointers */ @@ -171,7 +159,6 @@ typedef struct xfs_buf { struct xfs_buf_map *b_maps; /* compound 
buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; - int b_io_length; /* IO size in BBs */ atomic_t b_pin_count; /* pin count */ atomic_t b_io_remaining; /* #outstanding I/O requests */ unsigned int b_page_count; /* size of page array */ @@ -205,21 +192,6 @@ struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, xfs_daddr_t blkno, size_t numblks, xfs_buf_flags_t flags); -struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags); - -static inline struct xfs_buf * -xfs_buf_alloc( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) -{ - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return _xfs_buf_alloc(target, &map, 1, flags); -} - struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags); @@ -235,11 +207,10 @@ static inline struct xfs_buf * xfs_buf_get( struct xfs_buftarg *target, xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) + size_t numblks) { DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return xfs_buf_get_map(target, &map, 1, flags); + return xfs_buf_get_map(target, &map, 1, 0); } static inline struct xfs_buf * @@ -265,9 +236,6 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); -int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); - struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, @@ -301,10 +269,7 @@ static inline int xfs_buf_submit(struct xfs_buf *bp) return __xfs_buf_submit(bp, wait); } -extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, - xfs_buf_rw_t); -#define xfs_buf_zero(bp, off, len) \ - xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) +void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); /* Buffer Utility Routines */ extern void *xfs_buf_offset(struct xfs_buf *, size_t); @@ -385,6 +350,8 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); +bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); +bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 010db5f8fb00..7dcaec54a20b 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -5,19 +5,17 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" -#include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -520,7 +518,7 @@ xfs_buf_item_push( /* has a previous flush failed due to IO errors? */ if ((bp->b_flags & XBF_WRITE_FAIL) && ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { - xfs_warn(bp->b_target->bt_mount, + xfs_warn(bp->b_mount, "Failing async write on buffer block 0x%llx. 
Retrying async write.", (long long)bp->b_bn); } @@ -594,7 +592,7 @@ xfs_buf_item_put( * free the item. */ STATIC void -xfs_buf_item_unlock( +xfs_buf_item_release( struct xfs_log_item *lip) { struct xfs_buf_log_item *bip = BUF_ITEM(lip); @@ -605,9 +603,11 @@ xfs_buf_item_unlock( #if defined(DEBUG) || defined(XFS_WARN) bool ordered = bip->bli_flags & XFS_BLI_ORDERED; bool dirty = bip->bli_flags & XFS_BLI_DIRTY; + bool aborted = test_bit(XFS_LI_ABORTED, + &lip->li_flags); #endif - trace_xfs_buf_item_unlock(bip); + trace_xfs_buf_item_release(bip); /* * The bli dirty state should match whether the blf has logged segments @@ -633,10 +633,18 @@ xfs_buf_item_unlock( released = xfs_buf_item_put(bip); if (hold || (stale && !released)) return; - ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + ASSERT(!stale || aborted); xfs_buf_relse(bp); } +STATIC void +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ + return xfs_buf_item_release(lip); +} + /* * This is called to find out where the oldest active copy of the * buf log item in the on disk log resides now that the last log @@ -669,25 +677,15 @@ xfs_buf_item_committed( return lsn; } -STATIC void -xfs_buf_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) -{ -} - -/* - * This is the ops vector shared by all buf log items. - */ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, .iop_unpin = xfs_buf_item_unpin, - .iop_unlock = xfs_buf_item_unlock, + .iop_release = xfs_buf_item_release, + .iop_committing = xfs_buf_item_committing, .iop_committed = xfs_buf_item_committed, .iop_push = xfs_buf_item_push, - .iop_committing = xfs_buf_item_committing }; STATIC int @@ -741,7 +739,7 @@ xfs_buf_item_init( * this buffer. If we do already have one, there is * nothing to do here so return. */ - ASSERT(bp->b_target->bt_mount == mp); + ASSERT(bp->b_mount == mp); if (bip) { ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!bp->b_transp); @@ -978,9 +976,9 @@ xfs_buf_item_relse( */ void xfs_buf_attach_iodone( - xfs_buf_t *bp, - void (*cb)(xfs_buf_t *, xfs_log_item_t *), - xfs_log_item_t *lip) + struct xfs_buf *bp, + void (*cb)(struct xfs_buf *, struct xfs_log_item *), + struct xfs_log_item *lip) { ASSERT(xfs_buf_islocked(bp)); diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 90f65f891fab..4a054b11011a 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -39,7 +39,7 @@ struct xfs_buf_log_item; * locked, and which 128 byte chunks of the buffer are dirty. 
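
The struct comment just above notes that the buf log item tracks which 128-byte chunks of the buffer are dirty; that is an ordinary bitmap keyed by chunk index. A sketch of the range-to-chunk rounding (the names and layout here are illustrative, not the kernel's actual blf_data_map):

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE	128u	/* logging granularity, in bytes */

/* Mark every 128-byte chunk touched by [first, last] dirty in a bitmap. */
static void log_range(uint32_t *dirty_map, unsigned int first,
		      unsigned int last)
{
	unsigned int	c;

	for (c = first / CHUNK_SIZE; c <= last / CHUNK_SIZE; c++)
		dirty_map[c / 32] |= 1u << (c % 32);
}

int main(void)
{
	uint32_t	map[4] = { 0 };	/* 128 bits: enough for 16K */

	log_range(map, 100, 300);	/* touches chunks 0, 1 and 2 */
	printf("%#x\n", (unsigned)map[0]);	/* prints 0x7 */
	return 0;
}
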
*/ struct xfs_buf_log_item { - xfs_log_item_t bli_item; /* common item structure */ + struct xfs_log_item bli_item; /* common item structure */ struct xfs_buf *bli_buf; /* real buffer pointer */ unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ @@ -55,8 +55,8 @@ bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, - void(*)(struct xfs_buf *, xfs_log_item_t *), - xfs_log_item_t *); + void(*)(struct xfs_buf *, struct xfs_log_item *), + struct xfs_log_item *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 5142e64e2345..283df898dd9f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -6,17 +6,14 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" -#include "xfs_error.h" #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 93f07edafd81..8ec7aab89044 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -4,19 +4,17 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_quota.h" -#include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_extent_busy.h" -#include "xfs_discard.h" #include "xfs_trace.h" #include "xfs_log.h" @@ -161,9 +159,19 @@ xfs_ioc_trim( return -EPERM; if (!blk_queue_discard(q)) return -EOPNOTSUPP; + + /* + * We haven't recovered the log, so we cannot use our bnobt-guided + * storage zapping commands. 
+ */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return -EROFS; + if (copy_from_user(&range, urange, sizeof(range))) return -EFAULT; + range.minlen = max_t(u64, granularity, range.minlen); + minlen = BTOBB(range.minlen); /* * Truncating down the len isn't actually quite correct, but using * BBTOB would mean we trivially get overflows for values @@ -178,7 +186,6 @@ xfs_ioc_trim( start = BTOBB(range.start); end = start + BTOBBT(range.len) - 1; - minlen = BTOBB(max_t(u64, granularity, range.minlen)); if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 87e6dd5326d5..fb1ad4483081 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -14,16 +14,12 @@ #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" -#include "xfs_alloc.h" #include "xfs_quota.h" -#include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_space.h" #include "xfs_trans_priv.h" #include "xfs_qm.h" -#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" @@ -277,7 +273,8 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) /* * Ensure that the given in-core dquot has a buffer on disk backing it, and - * return the buffer. This is called when the bmapi finds a hole. + * return the buffer locked and held. This is called when the bmapi finds a + * hole. */ STATIC int xfs_dquot_disk_alloc( @@ -355,13 +352,14 @@ xfs_dquot_disk_alloc( * If everything succeeds, the caller of this function is returned a * buffer that is locked and held to the transaction. The caller * is responsible for unlocking any buffer passed back, either - * manually or by committing the transaction. + * manually or by committing the transaction. On error, the buffer is + * released and not passed back. */ xfs_trans_bhold(tp, bp); error = xfs_defer_finish(tpp); - tp = *tpp; if (error) { - xfs_buf_relse(bp); + xfs_trans_bhold_release(*tpp, bp); + xfs_trans_brelse(*tpp, bp); return error; } *bpp = bp; @@ -521,7 +519,6 @@ xfs_qm_dqread_alloc( struct xfs_buf **bpp) { struct xfs_trans *tp; - struct xfs_buf *bp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, @@ -529,7 +526,7 @@ xfs_qm_dqread_alloc( if (error) goto err; - error = xfs_dquot_disk_alloc(&tp, dqp, &bp); + error = xfs_dquot_disk_alloc(&tp, dqp, bpp); if (error) goto err_cancel; @@ -539,10 +536,10 @@ xfs_qm_dqread_alloc( * Buffer was held to the transaction, so we have to unlock it * manually here because we're not passing it back. */ - xfs_buf_relse(bp); + xfs_buf_relse(*bpp); + *bpp = NULL; goto err; } - *bpp = bp; return 0; err_cancel: @@ -1242,7 +1239,7 @@ xfs_qm_exit(void) /* * Iterate every dquot of a particular type. The caller must ensure that the * particular quota type is active. iter_fn can return negative error codes, - * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating. + * or XFS_ITER_ABORT to indicate that it wants to stop iterating. 
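
On the comment fix above: iteration callbacks in this style return a negative errno to fail the walk and a positive sentinel (here XFS_ITER_ABORT) to end it cleanly. A self-contained model of that callback contract, with an invented sentinel value and walker:

#include <stdio.h>

#define ITER_ABORT	1	/* positive: stop early, not an error */

typedef int (*iter_fn)(int item, void *priv);

/* Walk items; a negative return is an error, ITER_ABORT stops cleanly. */
static int iterate(const int *items, int n, iter_fn fn, void *priv)
{
	int	i, ret;

	for (i = 0; i < n; i++) {
		ret = fn(items[i], priv);
		if (ret == ITER_ABORT)
			return 0;
		if (ret < 0)
			return ret;
	}
	return 0;
}

static int find_first_big(int item, void *priv)
{
	if (item > 100) {
		*(int *)priv = item;
		return ITER_ABORT;
	}
	return 0;
}

int main(void)
{
	int	items[] = { 3, 250, 7 };
	int	found = -1;

	iterate(items, 3, find_first_big, &found);
	printf("found=%d\n", found);	/* 250 */
	return 0;
}
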
*/ int xfs_qm_dqiterate( diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 64bd8640f6e8..4fe85709d55d 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -34,7 +34,6 @@ typedef struct xfs_dquot { uint dq_flags; /* various flags (XFS_DQ_*) */ struct list_head q_lru; /* global free list of dquots */ struct xfs_mount*q_mount; /* filesystem this relates to */ - struct xfs_trans*q_transp; /* trans this belongs to currently */ uint q_nrefs; /* # active refs from inodes */ xfs_daddr_t q_blkno; /* blkno of dquot buffer */ int q_bufoffset; /* off of dq in buffer (# dquots) */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 7dedd17c4813..282ec5af293e 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -5,13 +5,13 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" -#include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" @@ -94,18 +94,6 @@ xfs_qm_dquot_logitem_unpin( wake_up(&dqp->q_pinwait); } -STATIC xfs_lsn_t -xfs_qm_dquot_logitem_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - /* - * We always re-log the entire dquot when it becomes dirty, - * so, the latest copy _is_ the only one that matters. - */ - return lsn; -} - /* * This is called to wait for the given dquot to be unpinned. * Most of these pin/unpin routines are plagiarized from inode code. @@ -209,14 +197,8 @@ out_unlock: return rval; } -/* - * Unlock the dquot associated with the log item. - * Clear the fields of the dquot and dquot log item that - * are specific to the current transaction. If the - * hold flags is set, do not unlock the dquot. - */ STATIC void -xfs_qm_dquot_logitem_unlock( +xfs_qm_dquot_logitem_release( struct xfs_log_item *lip) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; @@ -224,11 +206,6 @@ xfs_qm_dquot_logitem_unlock( ASSERT(XFS_DQ_IS_LOCKED(dqp)); /* - * Clear the transaction pointer in the dquot - */ - dqp->q_transp = NULL; - - /* * dquots are never 'held' from getting unlocked at the end of * a transaction. Their locking and unlocking is hidden inside the * transaction layer, within trans_commit. Hence, no LI_HOLD flag @@ -237,30 +214,22 @@ xfs_qm_dquot_logitem_unlock( xfs_dqunlock(dqp); } -/* - * this needs to stamp an lsn into the dquot, I think. - * rpc's that look at user dquot's would then have to - * push on the dependency recorded in the dquot - */ STATIC void xfs_qm_dquot_logitem_committing( struct xfs_log_item *lip, - xfs_lsn_t lsn) + xfs_lsn_t commit_lsn) { + return xfs_qm_dquot_logitem_release(lip); } -/* - * This is the ops vector for dquots - */ static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, .iop_unpin = xfs_qm_dquot_logitem_unpin, - .iop_unlock = xfs_qm_dquot_logitem_unlock, - .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_release = xfs_qm_dquot_logitem_release, + .iop_committing = xfs_qm_dquot_logitem_committing, .iop_push = xfs_qm_dquot_logitem_push, - .iop_committing = xfs_qm_dquot_logitem_committing, .iop_error = xfs_dquot_item_error }; @@ -320,26 +289,6 @@ xfs_qm_qoff_logitem_format( } /* - * Pinning has no meaning for an quotaoff item, so just return. 
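
The stub deletions in this region are possible because the log-item dispatcher treats a missing callback as a no-op, so empty iop_pin/iop_unpin implementations can simply be dropped from the ops vectors. A sketch of that optional-callback convention (all names invented):

#include <stdio.h>

struct item;

struct item_ops {
	void	(*pin)(struct item *);	/* optional: NULL means no-op */
	void	(*release)(struct item *);
};

struct item {
	const struct item_ops	*ops;
	const char		*name;
};

static void item_pin(struct item *it)
{
	if (it->ops->pin)	/* skip items that can't be pinned */
		it->ops->pin(it);
}

static void quotaoff_release(struct item *it)
{
	printf("releasing %s\n", it->name);
}

/* No .pin member: pinning is meaningless for this item type. */
static const struct item_ops quotaoff_ops = {
	.release	= quotaoff_release,
};

int main(void)
{
	struct item it = { .ops = &quotaoff_ops, .name = "quotaoff" };

	item_pin(&it);		/* silently does nothing */
	it.ops->release(&it);
	return 0;
}
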
- */ -STATIC void -xfs_qm_qoff_logitem_pin( - struct xfs_log_item *lip) -{ -} - -/* - * Since pinning has no meaning for an quotaoff item, unpinning does - * not either. - */ -STATIC void -xfs_qm_qoff_logitem_unpin( - struct xfs_log_item *lip, - int remove) -{ -} - -/* * There isn't much you can do to push a quotaoff item. It is simply * stuck waiting for the log to be flushed to disk. */ @@ -351,28 +300,6 @@ xfs_qm_qoff_logitem_push( return XFS_ITEM_LOCKED; } -/* - * Quotaoff items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ -STATIC void -xfs_qm_qoff_logitem_unlock( - struct xfs_log_item *lip) -{ -} - -/* - * The quotaoff-start-item is logged only once and cannot be moved in the log, - * so simply return the lsn at which it's been logged. - */ -STATIC xfs_lsn_t -xfs_qm_qoff_logitem_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - return lsn; -} - STATIC xfs_lsn_t xfs_qm_qoffend_logitem_committed( struct xfs_log_item *lip, @@ -396,50 +323,17 @@ xfs_qm_qoffend_logitem_committed( return (xfs_lsn_t)-1; } -/* - * XXX rcc - don't know quite what to do with this. I think we can - * just ignore it. The only time that isn't the case is if we allow - * the client to somehow see that quotas have been turned off in which - * we can't allow that to get back until the quotaoff hits the disk. - * So how would that happen? Also, do we need different routines for - * quotaoff start and quotaoff end? I suspect the answer is yes but - * to be sure, I need to look at the recovery code and see how quota off - * recovery is handled (do we roll forward or back or do something else). - * If we roll forwards or backwards, then we need two separate routines, - * one that does nothing and one that stamps in the lsn that matters - * (truly makes the quotaoff irrevocable). If we do something else, - * then maybe we don't need two. - */ -STATIC void -xfs_qm_qoff_logitem_committing( - struct xfs_log_item *lip, - xfs_lsn_t commit_lsn) -{ -} - static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, - .iop_pin = xfs_qm_qoff_logitem_pin, - .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_unlock = xfs_qm_qoff_logitem_unlock, .iop_committed = xfs_qm_qoffend_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, - .iop_committing = xfs_qm_qoff_logitem_committing }; -/* - * This is the ops vector shared by all quotaoff-start log items. 
- */ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, - .iop_pin = xfs_qm_qoff_logitem_pin, - .iop_unpin = xfs_qm_qoff_logitem_unpin, - .iop_unlock = xfs_qm_qoff_logitem_unlock, - .iop_committed = xfs_qm_qoff_logitem_committed, .iop_push = xfs_qm_qoff_logitem_push, - .iop_committing = xfs_qm_qoff_logitem_committing }; /* diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index db9df710a308..1aed34ccdabc 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -12,13 +12,13 @@ struct xfs_mount; struct xfs_qoff_logitem; typedef struct xfs_dq_logitem { - xfs_log_item_t qli_item; /* common portion */ + struct xfs_log_item qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { - xfs_log_item_t qql_item; /* common portion */ + struct xfs_log_item qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ unsigned int qql_flags; } xfs_qoff_logitem_t; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 9866f542e77b..544c9482a0ef 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -4,6 +4,7 @@ * All Rights Reserved. */ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_fs.h" #include "xfs_log_format.h" @@ -51,6 +52,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_BUF_LRU_REF, XFS_RANDOM_FORCE_SCRUB_REPAIR, XFS_RANDOM_FORCE_SUMMARY_RECALC, + XFS_RANDOM_IUNLINK_FALLBACK, }; struct xfs_errortag_attr { @@ -159,6 +161,7 @@ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); +XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -195,6 +198,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), XFS_ERRORTAG_ATTR_LIST(force_repair), XFS_ERRORTAG_ATTR_LIST(bad_summary), + XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), NULL, }; @@ -350,14 +354,15 @@ xfs_buf_verifier_error( size_t bufsz, xfs_failaddr_t failaddr) { - struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; int sz; fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", + xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, + "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", fa, bp->b_ops->name, bp->b_bn, name); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 246d3e989c6c..602aa7d62b66 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -98,5 +98,6 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 +#define XFS_PTAG_VERIFIER_ERROR 0x00000100 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index f2284ceb129f..f1372f9046e3 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -4,18 +4,16 @@ * All Rights Reserved. 
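
Context for the errortag additions above: each tag, such as the new iunlink_fallback, is a frequency knob meaning roughly "fail one in every N calls". A self-contained model of this style of randomized fault injection (the tag name and default frequency are placeholders):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

enum { TAG_IUNLINK_FALLBACK, TAG_MAX };

/* 0 disables a tag; N means "trigger on ~1 of every N calls". */
static unsigned int errortag_freq[TAG_MAX] = {
	[TAG_IUNLINK_FALLBACK] = 50,
};

static bool errortag_test(int tag)
{
	unsigned int	freq = errortag_freq[tag];

	return freq && (unsigned int)rand() % freq == 0;
}

int main(void)
{
	int	i, hits = 0;

	srand(42);
	for (i = 0; i < 10000; i++)
		if (errortag_test(TAG_IUNLINK_FALLBACK))
			hits++;
	printf("injected %d failures in 10000 calls (~1/50 expected)\n",
	       hits);
	return 0;
}
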
*/ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_export.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_pnfs.h" diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 74ddf66f4cfe..86f6512d6864 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -9,14 +9,18 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" +#include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_buf_item.h" #include "xfs_extfree_item.h" #include "xfs_log.h" #include "xfs_btree.h" #include "xfs_rmap.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_trace.h" kmem_zone_t *xfs_efi_zone; @@ -107,15 +111,6 @@ xfs_efi_item_format( /* - * Pinning has no meaning for an efi item, so just return. - */ -STATIC void -xfs_efi_item_pin( - struct xfs_log_item *lip) -{ -} - -/* * The unpin operation is the last place an EFI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In * either case, the EFI transaction has been successfully committed to make it @@ -133,71 +128,22 @@ xfs_efi_item_unpin( } /* - * Efi items have no locking or pushing. However, since EFIs are pulled from - * the AIL when their corresponding EFDs are committed to disk, their situation - * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller - * will eventually flush the log. This should help in getting the EFI out of - * the AIL. - */ -STATIC uint -xfs_efi_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_PINNED; -} - -/* * The EFI has been either committed or aborted if the transaction has been * cancelled. If the transaction was cancelled, an EFD isn't going to be * constructed and thus we free the EFI here directly. */ STATIC void -xfs_efi_item_unlock( +xfs_efi_item_release( struct xfs_log_item *lip) { - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) - xfs_efi_release(EFI_ITEM(lip)); -} - -/* - * The EFI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. - */ -STATIC xfs_lsn_t -xfs_efi_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - return lsn; -} - -/* - * The EFI dependency tracking op doesn't do squat. It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ -STATIC void -xfs_efi_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ + xfs_efi_release(EFI_ITEM(lip)); } -/* - * This is the ops vector shared by all efi log items. 
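
The EFI item ops can shrink to unpin plus release because the EFI's lifetime is purely reference-counted: creation takes one reference for the log and one for the eventual EFD, and both paths funnel into xfs_efi_release. A toy, non-atomic refcount model of that pairing:

#include <stdio.h>

struct intent {
	int	refcount;	/* the kernel uses an atomic_t */
};

/* Both log-commit unpin and the done item drop through here. */
static void intent_release(struct intent *efi)
{
	if (--efi->refcount == 0)
		printf("intent freed\n");
}

int main(void)
{
	struct intent efi = { .refcount = 2 };	/* log + done item */

	intent_release(&efi);	/* log commit drops one reference */
	intent_release(&efi);	/* EFD (or abort) drops the last one */
	return 0;
}
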
- */ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, - .iop_pin = xfs_efi_item_pin, .iop_unpin = xfs_efi_item_unpin, - .iop_unlock = xfs_efi_item_unlock, - .iop_committed = xfs_efi_item_committed, - .iop_push = xfs_efi_item_push, - .iop_committing = xfs_efi_item_committing + .iop_release = xfs_efi_item_release, }; @@ -349,136 +295,298 @@ xfs_efd_item_format( } /* - * Pinning has no meaning for an efd item, so just return. + * The EFD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the EFI and free the EFD. */ STATIC void -xfs_efd_item_pin( +xfs_efd_item_release( struct xfs_log_item *lip) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + + xfs_efi_release(efdp->efd_efip); + xfs_efd_item_free(efdp); } +static const struct xfs_item_ops xfs_efd_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_release = xfs_efd_item_release, +}; + /* - * Since pinning has no meaning for an efd item, unpinning does - * not either. + * Allocate an "extent free done" log item that will hold nextents worth of + * extents. The caller must use all nextents extents, because we are not + * flexible about this at all. */ -STATIC void -xfs_efd_item_unpin( - struct xfs_log_item *lip, - int remove) +static struct xfs_efd_log_item * +xfs_trans_get_efd( + struct xfs_trans *tp, + struct xfs_efi_log_item *efip, + unsigned int nextents) { + struct xfs_efd_log_item *efdp; + + ASSERT(nextents > 0); + + if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kmem_zalloc(sizeof(struct xfs_efd_log_item) + + (nextents - 1) * sizeof(struct xfs_extent), + KM_SLEEP); + } else { + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = nextents; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; } /* - * There isn't much you can do to push on an efd item. It is simply stuck - * waiting for the log to be flushed to disk. + * Free an extent and log it to the EFD. Note that the transaction is marked + * dirty regardless of whether the extent free succeeds or fails to support the + * EFI/EFD lifecycle rules. */ -STATIC uint -xfs_efd_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) +static int +xfs_trans_free_extent( + struct xfs_trans *tp, + struct xfs_efd_log_item *efdp, + xfs_fsblock_t start_block, + xfs_extlen_t ext_len, + const struct xfs_owner_info *oinfo, + bool skip_discard) { - return XFS_ITEM_PINNED; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, + start_block); + int error; + + trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); + + error = __xfs_free_extent(tp, start_block, ext_len, + oinfo, XFS_AG_RESV_NONE, skip_discard); + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) 
shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+	next_extent = efdp->efd_next_extent;
+	ASSERT(next_extent < efdp->efd_format.efd_nextents);
+	extp = &(efdp->efd_format.efd_extents[next_extent]);
+	extp->ext_start = start_block;
+	extp->ext_len = ext_len;
+	efdp->efd_next_extent++;
+
+	return error;
 }

-/*
- * The EFD is either committed or aborted if the transaction is cancelled. If
- * the transaction is cancelled, drop our reference to the EFI and free the EFD.
- */
-STATIC void
-xfs_efd_item_unlock(
-	struct xfs_log_item	*lip)
+/* Sort extent free items by AG. */
+static int
+xfs_extent_free_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
 {
-	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	struct xfs_mount		*mp = priv;
+	struct xfs_extent_free_item	*ra;
+	struct xfs_extent_free_item	*rb;
+
+	ra = container_of(a, struct xfs_extent_free_item, xefi_list);
+	rb = container_of(b, struct xfs_extent_free_item, xefi_list);
+	return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+}

-	if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
-		xfs_efi_release(efdp->efd_efip);
-		xfs_efd_item_free(efdp);
-	}
+/* Get an EFI. */
+STATIC void *
+xfs_extent_free_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_efi_log_item		*efip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	efip = xfs_efi_init(tp->t_mountp, count);
+	ASSERT(efip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &efip->efi_item);
+	return efip;
 }

-/*
- * When the efd item is committed to disk, all we need to do is delete our
- * reference to our partner efi item and then free ourselves. Since we're
- * freeing ourselves we must return -1 to keep the transaction code from further
- * referencing this item.
- */
-STATIC xfs_lsn_t
-xfs_efd_item_committed(
-	struct xfs_log_item	*lip,
-	xfs_lsn_t		lsn)
+/* Log a free extent to the intent item. */
+STATIC void
+xfs_extent_free_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
 {
-	struct xfs_efd_log_item	*efdp = EFD_ITEM(lip);
+	struct xfs_efi_log_item		*efip = intent;
+	struct xfs_extent_free_item	*free;
+	uint				next_extent;
+	struct xfs_extent		*extp;
+
+	free = container_of(item, struct xfs_extent_free_item, xefi_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);

 	/*
-	 * Drop the EFI reference regardless of whether the EFD has been
-	 * aborted. Once the EFD transaction is constructed, it is the sole
-	 * responsibility of the EFD to release the EFI (even if the EFI is
-	 * aborted due to log I/O error).
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
 	 */
-	xfs_efi_release(efdp->efd_efip);
-	xfs_efd_item_free(efdp);
+	next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
+	ASSERT(next_extent < efip->efi_format.efi_nextents);
+	extp = &efip->efi_format.efi_extents[next_extent];
+	extp->ext_start = free->xefi_startblock;
+	extp->ext_len = free->xefi_blockcount;
+}

-	return (xfs_lsn_t)-1;
+/* Get an EFD so we can process all the free extents. */
+STATIC void *
+xfs_extent_free_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_efd(tp, intent, count);
 }

-/*
- * The EFD dependency tracking op doesn't do squat. It can't because
- * it doesn't know where the free extent is coming from.
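
xfs_extent_free_diff_items in the hunk above lets the defer framework sort queued frees so that allocation-group locks are always taken in ascending AG order, avoiding lock inversions. The same idea expressed with qsort(), under made-up types and a fixed AG size:

#include <stdio.h>
#include <stdlib.h>

#define AG_BLOCKS	8192ull	/* pretend each AG spans 8192 blocks */

struct free_item {
	unsigned long long	startblock;
};

/* Group work by AG number so locks are acquired in ascending order. */
static int cmp_by_ag(const void *a, const void *b)
{
	unsigned long long ag_a =
		((const struct free_item *)a)->startblock / AG_BLOCKS;
	unsigned long long ag_b =
		((const struct free_item *)b)->startblock / AG_BLOCKS;

	return (ag_a > ag_b) - (ag_a < ag_b);
}

int main(void)
{
	struct free_item items[] = {
		{ 3 * AG_BLOCKS + 7 }, { 5 }, { 1 * AG_BLOCKS + 99 },
	};
	int	i;

	qsort(items, 3, sizeof(items[0]), cmp_by_ag);
	for (i = 0; i < 3; i++)
		printf("block %llu (AG %llu)\n", items[i].startblock,
		       items[i].startblock / AG_BLOCKS);
	return 0;
}

Writing the comparator as (a > b) - (a < b) sidesteps the overflow a raw subtraction could hit with wide types; the kernel comparator can subtract directly because AG numbers are small.
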
The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ +/* Process a free extent. */ +STATIC int +xfs_extent_free_finish_item( + struct xfs_trans *tp, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_extent_free_item *free; + int error; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + error = xfs_trans_free_extent(tp, done_item, + free->xefi_startblock, + free->xefi_blockcount, + &free->xefi_oinfo, free->xefi_skip_discard); + kmem_free(free); + return error; +} + +/* Abort all pending EFIs. */ STATIC void -xfs_efd_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +xfs_extent_free_abort_intent( + void *intent) { + xfs_efi_release(intent); } -/* - * This is the ops vector shared by all efd log items. - */ -static const struct xfs_item_ops xfs_efd_item_ops = { - .iop_size = xfs_efd_item_size, - .iop_format = xfs_efd_item_format, - .iop_pin = xfs_efd_item_pin, - .iop_unpin = xfs_efd_item_unpin, - .iop_unlock = xfs_efd_item_unlock, - .iop_committed = xfs_efd_item_committed, - .iop_push = xfs_efd_item_push, - .iop_committing = xfs_efd_item_committing +/* Cancel a free extent. */ +STATIC void +xfs_extent_free_cancel_item( + struct list_head *item) +{ + struct xfs_extent_free_item *free; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + kmem_free(free); +} + +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, }; /* - * Allocate and initialize an efd item with the given number of extents. + * AGFL blocks are accounted differently in the reserve pools and are not + * inserted into the busy extent list. */ -struct xfs_efd_log_item * -xfs_efd_init( - struct xfs_mount *mp, - struct xfs_efi_log_item *efip, - uint nextents) - +STATIC int +xfs_agfl_free_finish_item( + struct xfs_trans *tp, + struct list_head *item, + void *done_item, + void **state) { - struct xfs_efd_log_item *efdp; - uint size; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efd_log_item *efdp = done_item; + struct xfs_extent_free_item *free; + struct xfs_extent *extp; + struct xfs_buf *agbp; + int error; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + uint next_extent; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(free->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (!error) + error = xfs_free_agfl_block(tp, agno, agbno, agbp, + &free->xefi_oinfo); - ASSERT(nextents > 0); - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(xfs_efd_log_item_t) + - ((nextents - 1) * sizeof(xfs_extent_t))); - efdp = kmem_zalloc(size, KM_SLEEP); - } else { - efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); - } + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) 
shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = free->xefi_startblock; + extp->ext_len = free->xefi_blockcount; + efdp->efd_next_extent++; - return efdp; + kmem_free(free); + return error; } +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +}; + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 2a6a895ca73e..16aaab06d4ec 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -51,7 +51,7 @@ struct kmem_zone; * AIL, so at this point both the EFI and EFD are freed. */ typedef struct xfs_efi_log_item { - xfs_log_item_t efi_item; + struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; unsigned long efi_flags; /* misc flags */ @@ -64,7 +64,7 @@ typedef struct xfs_efi_log_item { * have been freed. */ typedef struct xfs_efd_log_item { - xfs_log_item_t efd_item; + struct xfs_log_item efd_item; xfs_efi_log_item_t *efd_efip; uint efd_next_extent; xfs_efd_log_format_t efd_format; @@ -79,8 +79,6 @@ extern struct kmem_zone *xfs_efi_zone; extern struct kmem_zone *xfs_efd_zone; xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); -xfs_efd_log_item_t *xfs_efd_init(struct xfs_mount *, xfs_efi_log_item_t *, - uint); int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e47425071e65..28101bbc0b78 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -10,14 +10,11 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_error.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_ioctl.h" @@ -28,9 +25,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" -#include <linux/dcache.h> #include <linux/falloc.h> -#include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/mman.h> @@ -367,20 +362,7 @@ restart: * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ - if (likely(!(file->f_mode & FMODE_NOCMTIME))) { - error = file_update_time(file); - if (error) - return error; - } - - /* - * If we're writing the file then make sure to clear the setuid and - * setgid bits if the process is not being run by root. This keeps - * people from modifying setuid and setgid binaries. 
- */ - if (!IS_NOSEC(inode)) - return file_remove_privs(file); - return 0; + return file_modified(file); } static int @@ -392,6 +374,7 @@ xfs_dio_write_end_io( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; + unsigned int nofs_flag; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); @@ -408,10 +391,17 @@ xfs_dio_write_end_io( */ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + /* + * We can allocate memory here while doing writeback on behalf of + * memory reclaim. To avoid memory allocation deadlocks set the + * task-wide nofs context for the following operations. + */ + nofs_flag = memalloc_nofs_save(); + if (flags & IOMAP_DIO_COW) { error = xfs_reflink_end_cow(ip, offset, size); if (error) - return error; + goto out; } /* @@ -420,8 +410,10 @@ xfs_dio_write_end_io( * earlier allows a racing dio read to find unwritten extents before * they are converted. */ - if (flags & IOMAP_DIO_UNWRITTEN) - return xfs_iomap_write_unwritten(ip, offset, size, true); + if (flags & IOMAP_DIO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, offset, size, true); + goto out; + } /* * We need to update the in-core inode size here so that we don't end up @@ -443,6 +435,8 @@ xfs_dio_write_end_io( spin_unlock(&ip->i_flags_lock); } +out: + memalloc_nofs_restore(nofs_flag); return error; } @@ -507,7 +501,7 @@ xfs_file_dio_aio_write( * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } @@ -517,6 +511,9 @@ xfs_file_dio_aio_write( } if (iocb->ki_flags & IOCB_NOWAIT) { + /* unaligned dio always waits, bail */ + if (unaligned_io) + return -EAGAIN; if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; } else { @@ -529,18 +526,14 @@ xfs_file_dio_aio_write( count = iov_iter_count(from); /* - * If we are doing unaligned IO, wait for all other IO to drain, - * otherwise demote the lock if we had to take the exclusive lock - * for other reasons in xfs_file_aio_write_checks. + * If we are doing unaligned IO, we can't allow any other overlapping IO + * in-flight at the same time or we risk data corruption. Wait for all + * other IO to drain before we submit. If the IO is aligned, demote the + * iolock if we had to take the exclusive lock in + * xfs_file_aio_write_checks() for other reasons. */ if (unaligned_io) { - /* If we are going to wait for other DIO to finish, bail */ - if (iocb->ki_flags & IOCB_NOWAIT) { - if (atomic_read(&inode->i_dio_count)) - return -EAGAIN; - } else { - inode_dio_wait(inode); - } + inode_dio_wait(inode); } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; @@ -548,6 +541,14 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + + /* + * If unaligned, this is the only IO in-flight. If it has not yet + * completed, wait on it before we release the iolock to prevent + * subsequent overlapping IO. 
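
The unaligned direct-I/O logic above amounts to "drain all in-flight I/O before an unaligned submission, and again before dropping the lock". A rough pthread model of that drain, where the counter and condition variable are simplified stand-ins for i_dio_count and inode_dio_wait():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int dio_count;	/* number of in-flight direct I/Os */

static void dio_begin(void)
{
	pthread_mutex_lock(&lock);
	dio_count++;
	pthread_mutex_unlock(&lock);
}

static void dio_end(void)
{
	pthread_mutex_lock(&lock);
	if (--dio_count == 0)
		pthread_cond_broadcast(&drained);
	pthread_mutex_unlock(&lock);
}

/* Unaligned writers call this before touching shared blocks. */
static void dio_wait(void)
{
	pthread_mutex_lock(&lock);
	while (dio_count > 0)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	dio_begin();		/* an aligned I/O is in flight */
	dio_end();		/* ... and completes */
	dio_wait();		/* unaligned writer is now safe to proceed */
	printf("drained\n");
	return 0;
}
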
+ */ + if (ret == -EIOCBQUEUED && unaligned_io) + inode_dio_wait(inode); out: xfs_iunlock(ip, iolock); @@ -872,14 +873,27 @@ xfs_file_fallocate( goto out_unlock; } - if (mode & FALLOC_FL_ZERO_RANGE) + if (mode & FALLOC_FL_ZERO_RANGE) { error = xfs_zero_file_space(ip, offset, len); - else { - if (mode & FALLOC_FL_UNSHARE_RANGE) { - error = xfs_reflink_unshare(ip, offset, len); - if (error) - goto out_unlock; + } else if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + + if (!xfs_is_always_cow_inode(ip)) { + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); } + } else { + /* + * If always_cow mode we can't use preallocations and + * thus should not create them. + */ + if (xfs_is_always_cow_inode(ip)) { + error = -EOPNOTSUPP; + goto out_unlock; + } + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); } @@ -1068,10 +1082,10 @@ xfs_file_llseek( default: return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); break; case SEEK_DATA: - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); break; } @@ -1183,11 +1197,14 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + struct dax_device *dax_dev; + + dax_dev = xfs_find_daxdev_for_inode(file_inode(filp)); /* - * We don't support synchronous mappings for non-DAX files. At least - * until someone comes with a sensible use case. + * We don't support synchronous mappings for non-DAX files and + * for DAX files if underneath dax_device is not synchronous. */ - if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + if (!daxdev_mapping_supported(vma, dax_dev)) return -EOPNOTSUPP; file_accessed(filp); @@ -1203,6 +1220,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .iopoll = iomap_dio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 182501373af2..574a7a8b4736 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -5,22 +5,19 @@ * All Rights Reserved. 
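
Summarizing the fallocate rework above: zeroing, unsharing, and plain preallocation get separate branches, and always-COW inodes refuse preallocation outright because copy-on-write would invalidate the preallocated blocks anyway. A sketch of that decision table, with hypothetical flag values and an always_cow stand-in for xfs_is_always_cow_inode():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define FL_ZERO_RANGE		(1 << 0)
#define FL_UNSHARE_RANGE	(1 << 1)

static bool always_cow = true;	/* imagine a mount option set this */

static int prealloc(void)
{
	puts("preallocating");
	return 0;
}

static int do_fallocate(int mode)
{
	if (mode & FL_ZERO_RANGE)
		return 0;		/* zeroing works on COW files too */

	if (mode & FL_UNSHARE_RANGE) {
		/* unshare shared blocks first (omitted) ... */
		if (always_cow)
			return 0;	/* nothing more we can pin down */
		return prealloc();
	}

	if (always_cow)
		return -EOPNOTSUPP;	/* preallocation is meaningless */
	return prealloc();
}

int main(void)
{
	printf("plain prealloc -> %d\n", do_fallocate(0));
	printf("unshare -> %d\n", do_fallocate(FL_UNSHARE_RANGE));
	return 0;
}
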
*/ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" -#include "xfs_filestream.h" #include "xfs_trace.h" #include "xfs_ag_resv.h" #include "xfs_trans.h" -#include "xfs_shared.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3d76a9e35870..5a8f9641562a 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -9,16 +9,12 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_error.h" #include "xfs_btree.h" #include "xfs_rmap_btree.h" #include "xfs_trace.h" -#include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_alloc.h" #include "xfs_bit.h" diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f3ef70c542e1..3e61d0cc23f8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -11,15 +11,11 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_error.h" -#include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_fsops.h" #include "xfs_trans_space.h" -#include "xfs_rtalloc.h" -#include "xfs_trace.h" #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" @@ -251,9 +247,9 @@ xfs_growfs_data( if (mp->m_sb.sb_imax_pct) { uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; do_div(icount, 100); - mp->m_maxicount = XFS_FSB_TO_INO(mp, icount); + M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount); } else - mp->m_maxicount = 0; + M_IGEO(mp)->maxicount = 0; /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); @@ -289,7 +285,7 @@ xfs_growfs_log( * exported through ioctl XFS_IOC_FSCOUNTS */ -int +void xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) @@ -302,7 +298,6 @@ xfs_fs_counts( spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; spin_unlock(&mp->m_sb_lock); - return 0; } /* @@ -533,6 +528,7 @@ xfs_fs_reserve_ag_blocks( int error = 0; int err2; + mp->m_finobt_nores = false; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { pag = xfs_perag_get(mp, agno); err2 = xfs_ag_resv_init(pag, NULL); diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index d023db0862c2..92869f6ec8d3 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,7 @@ extern int xfs_growfs_data(xfs_mount_t *mp, xfs_growfs_data_t *in); extern int xfs_growfs_log(xfs_mount_t *mp, xfs_growfs_log_t *in); -extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); +extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5169e84ae382..fa55ab8b8d80 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -4,7 +4,6 @@ * All Rights Reserved. */ #include "xfs.h" -#include "xfs_sysctl.h" /* * Tunable XFS parameters. 
xfs_params is required even when CONFIG_SYSCTL=n, @@ -16,7 +15,7 @@ xfs_param_t xfs_params = { /* MIN DFLT MAX */ .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 255 }, + .panic_mask = { 0, 0, 256 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, @@ -41,4 +40,7 @@ struct xfs_globals xfs_globals = { #else .bug_on_assert = false, /* assert failures WARN() */ #endif +#ifdef DEBUG + .pwork_threads = -1, /* automatic thread detection */ +#endif }; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c new file mode 100644 index 000000000000..8e0cb05a7142 --- /dev/null +++ b/fs/xfs/xfs_health.c @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trace.h" +#include "xfs_health.h" + +/* + * Warn about metadata corruption that we detected but haven't fixed, and + * make sure we're not sitting on anything that would get in the way of + * recovery. + */ +void +xfs_health_unmount( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + unsigned int sick = 0; + unsigned int checked = 0; + bool warn = false; + + if (XFS_FORCED_SHUTDOWN(mp)) + return; + + /* Measure AG corruption levels. */ + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + xfs_ag_measure_sickness(pag, &sick, &checked); + if (sick) { + trace_xfs_ag_unfixed_corruption(mp, agno, sick); + warn = true; + } + xfs_perag_put(pag); + } + + /* Measure realtime volume corruption levels. */ + xfs_rt_measure_sickness(mp, &sick, &checked); + if (sick) { + trace_xfs_rt_unfixed_corruption(mp, sick); + warn = true; + } + + /* + * Measure fs corruption and keep the sample around for the warning. + * See the note below for why we exempt FS_COUNTERS. + */ + xfs_fs_measure_sickness(mp, &sick, &checked); + if (sick & ~XFS_SICK_FS_COUNTERS) { + trace_xfs_fs_unfixed_corruption(mp, sick); + warn = true; + } + + if (warn) { + xfs_warn(mp, +"Uncorrected metadata errors detected; please run xfs_repair."); + + /* + * We discovered uncorrected metadata problems at some point + * during this filesystem mount and have advised the + * administrator to run repair once the unmount completes. + * + * However, we must be careful -- when FSCOUNTERS are flagged + * unhealthy, the unmount procedure omits writing the clean + * unmount record to the log so that the next mount will run + * recovery and recompute the summary counters. In other + * words, we leave a dirty log to get the counters fixed. + * + * Unfortunately, xfs_repair cannot recover dirty logs, so if + * there were filesystem problems, FSCOUNTERS was flagged, and + * the administrator takes our advice to run xfs_repair, + * they'll have to zap the log before repairing structures. + * We don't really want to encourage this, so we mark the + * FSCOUNTERS healthy so that a subsequent repair run won't see + * a dirty log. + */ + if (sick & XFS_SICK_FS_COUNTERS) + xfs_fs_mark_healthy(mp, XFS_SICK_FS_COUNTERS); + } +} + +/* Mark unhealthy per-fs metadata. 
*/ +void +xfs_fs_mark_sick( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + trace_xfs_fs_mark_sick(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_fs_sick |= mask; + mp->m_fs_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark a per-fs metadata healed. */ +void +xfs_fs_mark_healthy( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + trace_xfs_fs_mark_healthy(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_fs_sick &= ~mask; + mp->m_fs_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Sample which per-fs metadata are unhealthy. */ +void +xfs_fs_measure_sickness( + struct xfs_mount *mp, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&mp->m_sb_lock); + *sick = mp->m_fs_sick; + *checked = mp->m_fs_checked; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark unhealthy realtime metadata. */ +void +xfs_rt_mark_sick( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + trace_xfs_rt_mark_sick(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_rt_sick |= mask; + mp->m_rt_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark a realtime metadata healed. */ +void +xfs_rt_mark_healthy( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + trace_xfs_rt_mark_healthy(mp, mask); + + spin_lock(&mp->m_sb_lock); + mp->m_rt_sick &= ~mask; + mp->m_rt_checked |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Sample which realtime metadata are unhealthy. */ +void +xfs_rt_measure_sickness( + struct xfs_mount *mp, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&mp->m_sb_lock); + *sick = mp->m_rt_sick; + *checked = mp->m_rt_checked; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark unhealthy per-ag metadata. */ +void +xfs_ag_mark_sick( + struct xfs_perag *pag, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); + + spin_lock(&pag->pag_state_lock); + pag->pag_sick |= mask; + pag->pag_checked |= mask; + spin_unlock(&pag->pag_state_lock); +} + +/* Mark per-ag metadata ok. */ +void +xfs_ag_mark_healthy( + struct xfs_perag *pag, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask); + + spin_lock(&pag->pag_state_lock); + pag->pag_sick &= ~mask; + pag->pag_checked |= mask; + spin_unlock(&pag->pag_state_lock); +} + +/* Sample which per-ag metadata are unhealthy. */ +void +xfs_ag_measure_sickness( + struct xfs_perag *pag, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&pag->pag_state_lock); + *sick = pag->pag_sick; + *checked = pag->pag_checked; + spin_unlock(&pag->pag_state_lock); +} + +/* Mark the unhealthy parts of an inode. */ +void +xfs_inode_mark_sick( + struct xfs_inode *ip, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + trace_xfs_inode_mark_sick(ip, mask); + + spin_lock(&ip->i_flags_lock); + ip->i_sick |= mask; + ip->i_checked |= mask; + spin_unlock(&ip->i_flags_lock); +} + +/* Mark parts of an inode healed. */ +void +xfs_inode_mark_healthy( + struct xfs_inode *ip, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + trace_xfs_inode_mark_healthy(ip, mask); + + spin_lock(&ip->i_flags_lock); + ip->i_sick &= ~mask; + ip->i_checked |= mask; + spin_unlock(&ip->i_flags_lock); +} + +/* Sample which parts of an inode are unhealthy. 
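
All of the mark/measure helpers above repeat one deliberate shape: 'checked' records what has been examined, 'sick' records what failed, marking sick also sets checked, and every update happens under the lock that already protects the owning object (m_sb_lock, pag_state_lock, i_flags_lock). The pattern in miniature (illustrative names, not from the patch):

#include <linux/spinlock.h>

struct health {
	spinlock_t	lock;
	unsigned int	sick;		/* known-bad metadata */
	unsigned int	checked;	/* metadata we have examined */
};

static void mark_sick(struct health *h, unsigned int mask)
{
	spin_lock(&h->lock);
	h->sick |= mask;
	h->checked |= mask;		/* sick implies examined */
	spin_unlock(&h->lock);
}

static void mark_healthy(struct health *h, unsigned int mask)
{
	spin_lock(&h->lock);
	h->sick &= ~mask;
	h->checked |= mask;
	spin_unlock(&h->lock);
}

A bit set in ->checked but clear in ->sick is therefore a positive statement of health, distinct from 'never looked at'.
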
*/ +void +xfs_inode_measure_sickness( + struct xfs_inode *ip, + unsigned int *sick, + unsigned int *checked) +{ + spin_lock(&ip->i_flags_lock); + *sick = ip->i_sick; + *checked = ip->i_checked; + spin_unlock(&ip->i_flags_lock); +} + +/* Mappings between internal sick masks and ioctl sick masks. */ + +struct ioctl_sick_map { + unsigned int sick_mask; + unsigned int ioctl_mask; +}; + +static const struct ioctl_sick_map fs_map[] = { + { XFS_SICK_FS_COUNTERS, XFS_FSOP_GEOM_SICK_COUNTERS}, + { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, + { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA }, + { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, + { 0, 0 }, +}; + +static const struct ioctl_sick_map rt_map[] = { + { XFS_SICK_RT_BITMAP, XFS_FSOP_GEOM_SICK_RT_BITMAP }, + { XFS_SICK_RT_SUMMARY, XFS_FSOP_GEOM_SICK_RT_SUMMARY }, + { 0, 0 }, +}; + +static inline void +xfgeo_health_tick( + struct xfs_fsop_geom *geo, + unsigned int sick, + unsigned int checked, + const struct ioctl_sick_map *m) +{ + if (checked & m->sick_mask) + geo->checked |= m->ioctl_mask; + if (sick & m->sick_mask) + geo->sick |= m->ioctl_mask; +} + +/* Fill out fs geometry health info. */ +void +xfs_fsop_geom_health( + struct xfs_mount *mp, + struct xfs_fsop_geom *geo) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + geo->sick = 0; + geo->checked = 0; + + xfs_fs_measure_sickness(mp, &sick, &checked); + for (m = fs_map; m->sick_mask; m++) + xfgeo_health_tick(geo, sick, checked, m); + + xfs_rt_measure_sickness(mp, &sick, &checked); + for (m = rt_map; m->sick_mask; m++) + xfgeo_health_tick(geo, sick, checked, m); +} + +static const struct ioctl_sick_map ag_map[] = { + { XFS_SICK_AG_SB, XFS_AG_GEOM_SICK_SB }, + { XFS_SICK_AG_AGF, XFS_AG_GEOM_SICK_AGF }, + { XFS_SICK_AG_AGFL, XFS_AG_GEOM_SICK_AGFL }, + { XFS_SICK_AG_AGI, XFS_AG_GEOM_SICK_AGI }, + { XFS_SICK_AG_BNOBT, XFS_AG_GEOM_SICK_BNOBT }, + { XFS_SICK_AG_CNTBT, XFS_AG_GEOM_SICK_CNTBT }, + { XFS_SICK_AG_INOBT, XFS_AG_GEOM_SICK_INOBT }, + { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT }, + { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, + { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, + { 0, 0 }, +}; + +/* Fill out ag geometry health info. */ +void +xfs_ag_geom_health( + struct xfs_perag *pag, + struct xfs_ag_geometry *ageo) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + ageo->ag_sick = 0; + ageo->ag_checked = 0; + + xfs_ag_measure_sickness(pag, &sick, &checked); + for (m = ag_map; m->sick_mask; m++) { + if (checked & m->sick_mask) + ageo->ag_checked |= m->ioctl_mask; + if (sick & m->sick_mask) + ageo->ag_sick |= m->ioctl_mask; + } +} + +static const struct ioctl_sick_map ino_map[] = { + { XFS_SICK_INO_CORE, XFS_BS_SICK_INODE }, + { XFS_SICK_INO_BMBTD, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_BMBTC, XFS_BS_SICK_BMBTC }, + { XFS_SICK_INO_DIR, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, + { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, + { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { 0, 0 }, +}; + +/* Fill out bulkstat health info. 
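
Userspace reaches these translation tables through the geometry ioctls. A hedged example of polling filesystem health this way (assumes a uapi xfs_fs.h new enough to carry the sick/checked geometry fields):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <xfs/xfs_fs.h>		/* assumed: v5 geometry definitions */

int main(void)
{
	struct xfs_fsop_geom geo = { 0 };
	int fd = open("/mnt/xfs", O_RDONLY);	/* hypothetical mount */

	if (fd < 0 || ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0)
		return 1;
	if (geo.checked & XFS_FSOP_GEOM_SICK_COUNTERS)
		printf("summary counters: %s\n",
		       (geo.sick & XFS_FSOP_GEOM_SICK_COUNTERS) ?
				"sick" : "ok");
	return 0;
}
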
*/ +void +xfs_bulkstat_health( + struct xfs_inode *ip, + struct xfs_bulkstat *bs) +{ + const struct ioctl_sick_map *m; + unsigned int sick; + unsigned int checked; + + bs->bs_sick = 0; + bs->bs_checked = 0; + + xfs_inode_measure_sickness(ip, &sick, &checked); + for (m = ino_map; m->sick_mask; m++) { + if (checked & m->sick_mask) + bs->bs_checked |= m->ioctl_mask; + if (sick & m->sick_mask) + bs->bs_sick |= m->ioctl_mask; + } +} diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 245483cc282b..0b0fd10a36d4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -5,13 +5,13 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" @@ -23,8 +23,6 @@ #include "xfs_dquot.h" #include "xfs_reflink.h" -#include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/iversion.h> /* @@ -70,6 +68,11 @@ xfs_inode_alloc( ip->i_flags = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_sick = 0; + ip->i_checked = 0; + INIT_WORK(&ip->i_ioend_work, xfs_end_io); + INIT_LIST_HEAD(&ip->i_ioend_list); + spin_lock_init(&ip->i_ioend_lock); return ip; } @@ -446,6 +449,8 @@ xfs_iget_cache_hit( ip->i_flags |= XFS_INEW; xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; + ip->i_sick = 0; + ip->i_checked = 0; ASSERT(!rwsem_is_locked(&inode->i_rwsem)); init_rwsem(&inode->i_rwsem); @@ -1815,7 +1820,7 @@ xfs_inode_clear_cowblocks_tag( /* Disable post-EOF and CoW block auto-reclamation. */ void -xfs_icache_disable_reclaim( +xfs_stop_block_reaping( struct xfs_mount *mp) { cancel_delayed_work_sync(&mp->m_eofblocks_work); @@ -1824,7 +1829,7 @@ xfs_icache_disable_reclaim( /* Enable post-EOF and CoW block auto-reclamation. */ void -xfs_icache_enable_reclaim( +xfs_start_block_reaping( struct xfs_mount *mp) { xfs_queue_eofblocks(mp); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 26c0626f1f75..48f1fd2bb6ad 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -119,7 +119,7 @@ xfs_fs_eofblocks_from_user( int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); -void xfs_icache_disable_reclaim(struct xfs_mount *mp); -void xfs_icache_enable_reclaim(struct xfs_mount *mp); +void xfs_stop_block_reaping(struct xfs_mount *mp); +void xfs_start_block_reaping(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 8381d34cb102..d99a0a3e5f40 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -6,14 +6,9 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" -#include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_bit.h" -#include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_error.h" #include "xfs_icreate_item.h" #include "xfs_log.h" @@ -56,80 +51,18 @@ xfs_icreate_item_format( sizeof(struct xfs_icreate_log)); } - -/* Pinning has no meaning for the create item, so just return. */ STATIC void -xfs_icreate_item_pin( +xfs_icreate_item_release( struct xfs_log_item *lip) { + kmem_zone_free(xfs_icreate_zone, ICR_ITEM(lip)); } - -/* pinning has no meaning for the create item, so just return. 
*/ -STATIC void -xfs_icreate_item_unpin( - struct xfs_log_item *lip, - int remove) -{ -} - -STATIC void -xfs_icreate_item_unlock( - struct xfs_log_item *lip) -{ - struct xfs_icreate_item *icp = ICR_ITEM(lip); - - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) - kmem_zone_free(xfs_icreate_zone, icp); - return; -} - -/* - * Because we have ordered buffers being tracked in the AIL for the inode - * creation, we don't need the create item after this. Hence we can free - * the log item and return -1 to tell the caller we're done with the item. - */ -STATIC xfs_lsn_t -xfs_icreate_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_icreate_item *icp = ICR_ITEM(lip); - - kmem_zone_free(xfs_icreate_zone, icp); - return (xfs_lsn_t)-1; -} - -/* item can never get into the AIL */ -STATIC uint -xfs_icreate_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - ASSERT(0); - return XFS_ITEM_SUCCESS; -} - -/* Ordered buffers do the dependency tracking here, so this does nothing. */ -STATIC void -xfs_icreate_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ -} - -/* - * This is the ops vector shared by all buf log items. - */ static const struct xfs_item_ops xfs_icreate_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, - .iop_pin = xfs_icreate_item_pin, - .iop_unpin = xfs_icreate_item_unpin, - .iop_push = xfs_icreate_item_push, - .iop_unlock = xfs_icreate_item_unlock, - .iop_committed = xfs_icreate_item_committed, - .iop_committing = xfs_icreate_item_committing, + .iop_release = xfs_icreate_item_release, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae667ba74a1c..6467d5e1df2d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3,7 +3,6 @@ * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ -#include <linux/log2.h> #include <linux/iversion.h> #include "xfs.h" @@ -16,10 +15,7 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" -#include "xfs_attr_sf.h" #include "xfs_attr.h" #include "xfs_trans_space.h" #include "xfs_trans.h" @@ -32,7 +28,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_filestream.h" -#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" @@ -40,7 +35,6 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_reflink.h" -#include "xfs_dir2_priv.h" kmem_zone_t *xfs_inode_zone; @@ -441,12 +435,12 @@ xfs_lock_inumorder(int lock_mode, int subclass) */ static void xfs_lock_inodes( - xfs_inode_t **ips, - int inodes, - uint lock_mode) + struct xfs_inode **ips, + int inodes, + uint lock_mode) { - int attempts = 0, i, j, try_lock; - xfs_log_item_t *lp; + int attempts = 0, i, j, try_lock; + struct xfs_log_item *lp; /* * Currently supports between 2 and 5 inodes with exclusive locking. We @@ -485,7 +479,7 @@ again: */ if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { - lp = (xfs_log_item_t *)ips[j]->i_itemp; + lp = &ips[j]->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) try_lock++; } @@ -551,7 +545,7 @@ xfs_lock_two_inodes( struct xfs_inode *temp; uint mode_temp; int attempts = 0; - xfs_log_item_t *lp; + struct xfs_log_item *lp; ASSERT(hweight32(ip0_mode) == 1); ASSERT(hweight32(ip1_mode) == 1); @@ -585,7 +579,7 @@ xfs_lock_two_inodes( * the second lock. If we can't get it, we must release the first one * and try again. 
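
The locking prose here describes a classic ordered-acquisition scheme: inodes are always locked in ascending inode-number order, and once any already-held inode might be dirty in the AIL, the code switches to trylock and restarts rather than sleep while holding locks the AIL push worker may need. Reduced to its skeleton (illustrative, not the patch's code):

#include <linux/mutex.h>

/* Illustrative: ordered acquisition with trylock-and-restart. */
static void lock_pair(struct mutex *lo, struct mutex *hi)
{
again:
	mutex_lock(lo);
	if (!mutex_trylock(hi)) {
		mutex_unlock(lo);	/* never block on the second lock */
		goto again;		/* real code would back off first */
	}
}
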
*/ - lp = (xfs_log_item_t *)ip0->i_itemp; + lp = &ip0->i_itemp->ili_item; if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) { if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { xfs_iunlock(ip0, ip0_mode); @@ -1116,7 +1110,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -static int +static void xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1126,7 +1120,6 @@ xfs_bumplink( ASSERT(ip->i_d.di_version > 1); inc_nlink(VFS_I(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return 0; } int @@ -1235,9 +1228,7 @@ xfs_create( if (error) goto out_trans_cancel; - error = xfs_bumplink(tp, dp); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, dp); } /* @@ -1332,7 +1323,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip); + error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -1454,9 +1445,7 @@ xfs_link( xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - error = xfs_bumplink(tp, sip); - if (error) - goto error_return; + xfs_bumplink(tp, sip); /* * If this is a synchronous mount, make sure that the @@ -1754,7 +1743,7 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - if (unlikely(mp->m_inotbt_nores)) { + if (unlikely(mp->m_finobt_nores)) { error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); @@ -1907,86 +1896,510 @@ xfs_inactive( } /* - * This is called when the inode's link count goes to 0 or we are creating a - * tmpfile via O_TMPFILE. In the case of a tmpfile, @ignore_linkcount will be - * set to true as the link count is dropped to zero by the VFS after we've - * created the file successfully, so we have to add it to the unlinked list - * while the link count is non-zero. + * In-Core Unlinked List Lookups + * ============================= + * + * Every inode is supposed to be reachable from some other piece of metadata + * with the exception of the root directory. Inodes with a connection to a + * file descriptor but not linked from anywhere in the on-disk directory tree + * are collectively known as unlinked inodes, though the filesystem itself + * maintains links to these inodes so that on-disk metadata are consistent. + * + * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI + * header contains a number of buckets that point to an inode, and each inode + * record has a pointer to the next inode in the hash chain. This + * singly-linked list causes scaling problems in the iunlink remove function + * because we must walk that list to find the inode that points to the inode + * being removed from the unlinked hash bucket list. + * + * What if we modelled the unlinked list as a collection of records capturing + * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd + * have a fast way to look up unlinked list predecessors, which avoids the + * slow list walk. That's exactly what we do here (in-core) with a per-AG + * rhashtable. + * + * Because this is a backref cache, we ignore operational failures since the + * iunlink code can fall back to the slow bucket walk. The only errors that + * should bubble out are for obviously incorrect situations. + * + * All users of the backref cache MUST hold the AGI buffer lock to serialize + * access or have otherwise provided for concurrency control. 
+ */ + +/* Capture a "X.next_unlinked = Y" relationship. */ +struct xfs_iunlink { + struct rhash_head iu_rhash_head; + xfs_agino_t iu_agino; /* X */ + xfs_agino_t iu_next_unlinked; /* Y */ +}; + +/* Unlinked list predecessor lookup hashtable construction */ +static int +xfs_iunlink_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const xfs_agino_t *key = arg->key; + const struct xfs_iunlink *iu = obj; + + if (iu->iu_next_unlinked != *key) + return 1; + return 0; +} + +static const struct rhashtable_params xfs_iunlink_hash_params = { + .min_size = XFS_AGI_UNLINKED_BUCKETS, + .key_len = sizeof(xfs_agino_t), + .key_offset = offsetof(struct xfs_iunlink, + iu_next_unlinked), + .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = xfs_iunlink_obj_cmpfn, +}; + +/* + * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such + * relation is found. + */ +static xfs_agino_t +xfs_iunlink_lookup_backref( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_iunlink *iu; + + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + return iu ? iu->iu_agino : NULLAGINO; +} + +/* + * Take ownership of an iunlink cache entry and insert it into the hash table. + * If successful, the entry will be owned by the cache; if not, it is freed. + * Either way, the caller does not own @iu after this call. + */ +static int +xfs_iunlink_insert_backref( + struct xfs_perag *pag, + struct xfs_iunlink *iu) +{ + int error; + + error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + /* + * Fail loudly if there already was an entry because that's a sign of + * corruption of in-memory data. Also fail loudly if we see an error + * code we didn't anticipate from the rhashtable code. Currently we + * only anticipate ENOMEM. + */ + if (error) { + WARN(error != -ENOMEM, "iunlink cache insert error %d", error); + kmem_free(iu); + } + /* + * Absorb any runtime errors that aren't a result of corruption because + * this is a cache and we can always fall back to bucket list scanning. + */ + if (error != 0 && error != -EEXIST) + error = 0; + return error; +} + +/* Remember that @prev_agino.next_unlinked = @this_agino. */ +static int +xfs_iunlink_add_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t this_agino) +{ + struct xfs_iunlink *iu; + + if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) + return 0; + + iu = kmem_zalloc(sizeof(*iu), KM_SLEEP | KM_NOFS); + iu->iu_agino = prev_agino; + iu->iu_next_unlinked = this_agino; + + return xfs_iunlink_insert_backref(pag, iu); +} + +/* + * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. + * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there + * wasn't any such entry then we don't bother. + */ +static int +xfs_iunlink_change_backref( + struct xfs_perag *pag, + xfs_agino_t agino, + xfs_agino_t next_unlinked) +{ + struct xfs_iunlink *iu; + int error; + + /* Look up the old entry; if there wasn't one then exit. */ + iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, + xfs_iunlink_hash_params); + if (!iu) + return 0; + + /* + * Remove the entry. This shouldn't ever return an error, but if we + * couldn't remove the old entry we don't want to add it again to the + * hash table, and if the entry disappeared on us then someone's + * violated the locking rules and we need to fail loudly. 
Either way + * we cannot remove the inode because internal state is or would have + * been corrupt. + */ + error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, + &iu->iu_rhash_head, xfs_iunlink_hash_params); + if (error) + return error; + + /* If there is no new next entry just free our item and return. */ + if (next_unlinked == NULLAGINO) { + kmem_free(iu); + return 0; + } + + /* Update the entry and re-add it to the hash table. */ + iu->iu_next_unlinked = next_unlinked; + return xfs_iunlink_insert_backref(pag, iu); +} + +/* Set up the in-core predecessor structures. */ +int +xfs_iunlink_init( + struct xfs_perag *pag) +{ + return rhashtable_init(&pag->pagi_unlinked_hash, + &xfs_iunlink_hash_params); +} + +/* Free the in-core predecessor structures. */ +static void +xfs_iunlink_free_item( + void *ptr, + void *arg) +{ + struct xfs_iunlink *iu = ptr; + bool *freed_anything = arg; + + *freed_anything = true; + kmem_free(iu); +} + +void +xfs_iunlink_destroy( + struct xfs_perag *pag) +{ + bool freed_anything = false; + + rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, + xfs_iunlink_free_item, &freed_anything); + + ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); +} + +/* + * Point the AGI unlinked bucket at an inode and log the results. The caller + * is responsible for validating the old value. + */ +STATIC int +xfs_iunlink_update_bucket( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf *agibp, + unsigned int bucket_index, + xfs_agino_t new_agino) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + xfs_agino_t old_value; + int offset; + + ASSERT(xfs_verify_agino_or_null(tp->t_mountp, agno, new_agino)); + + old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); + trace_xfs_iunlink_update_bucket(tp->t_mountp, agno, bucket_index, + old_value, new_agino); + + /* + * We should never find the head of the list already set to the value + * passed in because either we're adding or removing ourselves from the + * head of the list. + */ + if (old_value == new_agino) + return -EFSCORRUPTED; + + agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); + offset = offsetof(struct xfs_agi, agi_unlinked) + + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +} + +/* Set an on-disk inode's next_unlinked pointer. */ +STATIC void +xfs_iunlink_update_dinode( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_buf *ibp, + struct xfs_dinode *dip, + struct xfs_imap *imap, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + int offset; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + trace_xfs_iunlink_update_dinode(mp, agno, agino, + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = imap->im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + xfs_inobp_check(mp, ibp); +} + +/* Set an in-core inode's unlinked pointer and return the old value. 
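
Taken together, the pieces above implement a predecessor index: entries are keyed on iu_next_unlinked (the Y in "X.next_unlinked = Y"), so a lookup by Y returns X in O(1) instead of walking the on-disk chain. A reduced sketch of the same rhashtable shape (standalone; names mirror but are not the patch's):

#include <linux/rhashtable.h>

struct backref {
	struct rhash_head	head;
	u32			pred;	/* X */
	u32			succ;	/* Y, the lookup key */
};

static const struct rhashtable_params backref_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct backref, succ),
	.head_offset	= offsetof(struct backref, head),
	.automatic_shrinking = true,
};

/* Return X where X.next == succ, or 0 if the cache has no entry. */
static u32 lookup_pred(struct rhashtable *ht, u32 succ)
{
	struct backref *br;

	br = rhashtable_lookup_fast(ht, &succ, backref_params);
	return br ? br->pred : 0;
}

Because this is only a cache, a miss is never an error: callers fall back to the bucket walk, which is why the insert path above deliberately swallows -ENOMEM.
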
*/ +STATIC int +xfs_iunlink_update_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_agnumber_t agno, + xfs_agino_t next_agino, + xfs_agino_t *old_next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + xfs_agino_t old_value; + int error; + + ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + return error; + + /* Make sure the old pointer isn't garbage. */ + old_value = be32_to_cpu(dip->di_next_unlinked); + if (!xfs_verify_agino_or_null(mp, agno, old_value)) { + error = -EFSCORRUPTED; + goto out; + } + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + *old_next_agino = old_value; + if (old_value == next_agino) { + if (next_agino != NULLAGINO) + error = -EFSCORRUPTED; + goto out; + } + + /* Ok, update the new pointer. */ + xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), + ibp, dip, &ip->i_imap, next_agino); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. * * We place the on-disk inode on a list in the AGI. It will be pulled from this * list when the inode is freed. */ STATIC int xfs_iunlink( - struct xfs_trans *tp, - struct xfs_inode *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_mount_t *mp = tp->t_mountp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agino_t agino; - short bucket_index; - int offset; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + xfs_agino_t next_agino; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; + ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ - error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the pointer isn't garbage and that this inode + * isn't already on the list. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - ASSERT(agino != 0); - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(agi->agi_unlinked[bucket_index]); - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); + next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (next_agino == agino || + !xfs_verify_agino_or_null(mp, agno, next_agino)) + return -EFSCORRUPTED; + + if (next_agino != NULLAGINO) { + struct xfs_perag *pag; + xfs_agino_t old_agino; - if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* - * There is already another inode in the bucket we need - * to add ourselves to. Add us at the front of the list. - * Here we put the head pointer into our next pointer, - * and then we fall through to point the head at us. 
+ * There is already another inode in the bucket, so point this + * inode to the current head of the list. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); + error = xfs_iunlink_update_inode(tp, ip, agno, next_agino, + &old_agino); if (error) return error; + ASSERT(old_agino == NULLAGINO); - ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); - dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); + /* + * agino has been unlinked, add a backref from the next inode + * back to agino. + */ + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_add_backref(pag, agino, next_agino); + xfs_perag_put(pag); + if (error) + return error; + } + + /* Point the head of the list to point to this inode. */ + return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino); +} + +/* Return the imap, dinode pointer, and buffer for an inode. */ +STATIC int +xfs_iunlink_map_ino( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = tp->t_mountp; + int error; - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); + imap->im_blkno = 0; + error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); + error = xfs_imap_to_bp(mp, tp, imap, dipp, bpp, 0, 0); + if (error) { + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); + return error; + } + + return 0; +} + +/* + * Walk the unlinked chain from @head_agino until we find the inode that + * points to @target_agino. Return the inode number, map, dinode pointer, + * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. + * + * @tp, @pag, @head_agino, and @target_agino are input parameters. + * @agino, @imap, @dipp, and @bpp are all output parameters. + * + * Do not call this function if @target_agino is the head of the list. + */ +STATIC int +xfs_iunlink_map_prev( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t head_agino, + xfs_agino_t target_agino, + xfs_agino_t *agino, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_agino_t next_agino; + int error; + + ASSERT(head_agino != target_agino); + *bpp = NULL; + + /* See if our backref cache can find it faster. */ + *agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (*agino != NULLAGINO) { + error = xfs_iunlink_map_ino(tp, agno, *agino, imap, dipp, bpp); + if (error) + return error; + + if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + return 0; + + /* + * If we get here the cache contents were corrupt, so drop the + * buffer and fall back to walking the bucket list. + */ + xfs_trans_brelse(tp, *bpp); + *bpp = NULL; + WARN_ON_ONCE(1); + } + + trace_xfs_iunlink_map_prev_fallback(mp, agno); + + /* Otherwise, walk the entire bucket until we find it. 
*/ + next_agino = head_agino; + while (next_agino != target_agino) { + xfs_agino_t unlinked_agino; + + if (*bpp) + xfs_trans_brelse(tp, *bpp); + + *agino = next_agino; + error = xfs_iunlink_map_ino(tp, agno, next_agino, imap, dipp, + bpp); + if (error) + return error; + + unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); + /* + * Make sure this pointer is valid and isn't an obvious + * infinite loop. + */ + if (!xfs_verify_agino(mp, agno, unlinked_agino) || + next_agino == unlinked_agino) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, + *dipp, sizeof(**dipp)); + error = -EFSCORRUPTED; + return error; + } + next_agino = unlinked_agino; } - /* - * Point the bucket head pointer at the inode being inserted. - */ - ASSERT(agino != 0); - agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); return 0; } @@ -1995,181 +2408,106 @@ xfs_iunlink( */ STATIC int xfs_iunlink_remove( - xfs_trans_t *tp, - xfs_inode_t *ip) + struct xfs_trans *tp, + struct xfs_inode *ip) { - xfs_ino_t next_ino; - xfs_mount_t *mp; - xfs_agi_t *agi; - xfs_dinode_t *dip; - xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_agino_t agino; - xfs_agino_t next_agino; - xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip = NULL; - short bucket_index; - int offset, last_offset = 0; - int error; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + struct xfs_buf *last_ibp; + struct xfs_dinode *last_dip = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t next_agino; + xfs_agino_t head_agino; + short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; + int error; - mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + trace_xfs_iunlink_remove(ip); - /* - * Get the agi buffer first. It ensures lock ordering - * on the list. - */ + /* Get the agi buffer first. It ensures lock ordering on the list. */ error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = XFS_BUF_TO_AGI(agibp); /* - * Get the index into the agi hash table for the - * list this inode will go on. + * Get the index into the agi hash table for the list this inode will + * go on. Make sure the head pointer isn't garbage. */ - agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - if (!xfs_verify_agino(mp, agno, agino)) - return -EFSCORRUPTED; - bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - if (!xfs_verify_agino(mp, agno, - be32_to_cpu(agi->agi_unlinked[bucket_index]))) { + head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); + if (!xfs_verify_agino(mp, agno, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; } - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { - /* - * We're at the head of the list. Get the inode's on-disk - * buffer to see if there is anyone after us on the list. - * Only modify our next pointer if it is not already NULLAGINO. - * This saves us the overhead of dealing with the buffer when - * there is no need to change it. 
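
The fallback loop just shown is an ordinary singly-linked-list predecessor search with a per-hop validity check; the backref cache merely short-circuits it. The core of the walk, reduced (illustrative):

/* Illustrative: find the node whose ->next points at target. The
 * caller guarantees head != target, as the patch asserts. */
struct node { unsigned int next; };

static struct node *find_prev(struct node *(*get)(unsigned int),
			      unsigned int head, unsigned int target)
{
	unsigned int cur = head;

	for (;;) {
		struct node *n = get(cur);

		if (!n || n->next == cur)
			return NULL;	/* corrupt link or self-loop */
		if (n->next == target)
			return n;	/* cur is the predecessor */
		cur = n->next;
	}
}
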
- */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the bucket head pointer at the next inode. - */ - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); - offset = offsetof(xfs_agi_t, agi_unlinked) + - (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_log_buf(tp, agibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - } else { - /* - * We need to search the list for the inode being freed. - */ - next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - last_ibp = NULL; - while (next_agino != agino) { - struct xfs_imap imap; + /* + * Set our inode's next_unlinked pointer to NULL and then return + * the old pointer value so that we can update whatever was previous + * to us in the list to point to whatever was next in the list. + */ + error = xfs_iunlink_update_inode(tp, ip, agno, NULLAGINO, &next_agino); + if (error) + return error; - if (last_ibp) - xfs_trans_brelse(tp, last_ibp); + /* + * If there was a backref pointing from the next inode back to this + * one, remove it because we've removed this inode from the list. + * + * Later, if this inode was in the middle of the list we'll update + * this inode's backref to point from the next inode. + */ + if (next_agino != NULLAGINO) { + pag = xfs_perag_get(mp, agno); + error = xfs_iunlink_change_backref(pag, next_agino, + NULLAGINO); + if (error) + goto out; + } - imap.im_blkno = 0; - next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); + if (head_agino == agino) { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, + next_agino); + if (error) + goto out; + } else { + struct xfs_imap imap; + xfs_agino_t prev_agino; - error = xfs_imap(mp, tp, next_ino, &imap, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } + if (!pag) + pag = xfs_perag_get(mp, agno); - error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, - &last_ibp, 0, 0); - if (error) { - xfs_warn(mp, - "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } + /* We need to search the list for the inode being freed. */ + error = xfs_iunlink_map_prev(tp, agno, head_agino, agino, + &prev_agino, &imap, &last_dip, &last_ibp, + pag); + if (error) + goto out; - last_offset = imap.im_boffset; - next_agino = be32_to_cpu(last_dip->di_next_unlinked); - if (!xfs_verify_agino(mp, agno, next_agino)) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - last_dip, sizeof(*last_dip)); - return -EFSCORRUPTED; - } - } + /* Point the previous inode on the list to the next inode. */ + xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp, + last_dip, &imap, next_agino); /* - * Now last_ibp points to the buffer previous to us on the - * unlinked list. Pull us from the list. + * Now we deal with the backref for this inode. 
If this inode + * pointed at a real inode, change the backref that pointed to + * us to point to our old next. If this inode was the end of + * the list, delete the backref that pointed to us. Note that + * change_backref takes care of deleting the backref if + * next_agino is NULLAGINO. */ - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, - 0, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", - __func__, error); - return error; - } - next_agino = be32_to_cpu(dip->di_next_unlinked); - ASSERT(next_agino != 0); - ASSERT(next_agino != agino); - if (next_agino != NULLAGINO) { - dip->di_next_unlinked = cpu_to_be32(NULLAGINO); - offset = ip->i_imap.im_boffset + - offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, ibp); - } else { - xfs_trans_brelse(tp, ibp); - } - /* - * Point the previous inode on the list to the next inode. - */ - last_dip->di_next_unlinked = cpu_to_be32(next_agino); - ASSERT(next_agino != 0); - offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, last_dip); - - xfs_trans_inode_buf(tp, last_ibp); - xfs_trans_log_buf(tp, last_ibp, offset, - (offset + sizeof(xfs_agino_t) - 1)); - xfs_inobp_check(mp, last_ibp); + error = xfs_iunlink_change_backref(pag, agino, next_agino); + if (error) + goto out; } - return 0; + +out: + if (pag) + xfs_perag_put(pag); + return error; } /* @@ -2193,13 +2531,14 @@ xfs_ifree_cluster( xfs_inode_log_item_t *iip; struct xfs_log_item *lip; struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_ino_t inum; inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); - nbufs = mp->m_ialloc_blks / mp->m_blocks_per_cluster; + nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; - for (j = 0; j < nbufs; j++, inum += mp->m_inodes_per_cluster) { + for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { /* * The allocation bitmap tells us which inodes of the chunk were * physically allocated. Skip the cluster if an inode falls into @@ -2207,7 +2546,7 @@ xfs_ifree_cluster( */ ioffset = inum - xic->first_ino; if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { - ASSERT(ioffset % mp->m_inodes_per_cluster == 0); + ASSERT(ioffset % igeo->inodes_per_cluster == 0); continue; } @@ -2223,7 +2562,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * mp->m_blocks_per_cluster, + mp->m_bsize * igeo->blocks_per_cluster, XBF_UNMAPPED); if (!bp) @@ -2270,7 +2609,7 @@ xfs_ifree_cluster( * transaction stale above, which means there is no point in * even trying to lock them. 
*/ - for (i = 0; i < mp->m_inodes_per_cluster; i++) { + for (i = 0; i < igeo->inodes_per_cluster; i++) { retry: rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, @@ -2748,9 +3087,7 @@ xfs_cross_rename( error = xfs_droplink(tp, dp2); if (error) goto out_trans_abort; - error = xfs_bumplink(tp, dp1); - if (error) - goto out_trans_abort; + xfs_bumplink(tp, dp1); } /* @@ -2774,9 +3111,7 @@ xfs_cross_rename( error = xfs_droplink(tp, dp1); if (error) goto out_trans_abort; - error = xfs_bumplink(tp, dp2); - if (error) - goto out_trans_abort; + xfs_bumplink(tp, dp2); } /* @@ -2833,11 +3168,9 @@ xfs_rename_alloc_whiteout( /* * Prepare the tmpfile inode as if it were created through the VFS. - * Otherwise, the link increment paths will complain about nlink 0->1. - * Drop the link count as done by d_tmpfile(), complete the inode setup - * and flag it as linkable. + * Complete the inode setup and flag it as linkable. nlink is already + * zero, so we can skip the drop_nlink. */ - drop_nlink(VFS_I(tmpfile)); xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -2975,9 +3308,7 @@ xfs_rename( XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (new_parent && src_is_directory) { - error = xfs_bumplink(tp, target_dp); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, target_dp); } } else { /* target_ip != NULL */ /* @@ -3096,9 +3427,7 @@ xfs_rename( */ if (wip) { ASSERT(VFS_I(wip)->i_nlink == 0); - error = xfs_bumplink(tp, wip); - if (error) - goto out_trans_cancel; + xfs_bumplink(tp, wip); error = xfs_iunlink_remove(tp, wip); if (error) goto out_trans_cancel; @@ -3138,28 +3467,27 @@ xfs_iflush_cluster( struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; unsigned long first_index, mask; - unsigned long inodes_per_cluster; int cilist_size; struct xfs_inode **cilist; struct xfs_inode *cip; + struct xfs_ino_geometry *igeo = M_IGEO(mp); int nr_found; int clcount = 0; int i; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *); cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); if (!cilist) goto out_put; - mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); + mask = ~(igeo->inodes_per_cluster - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, - first_index, inodes_per_cluster); + first_index, igeo->inodes_per_cluster); if (nr_found == 0) goto out_free; @@ -3267,7 +3595,6 @@ cluster_corrupt_out: * inode buffer and shut down the filesystem. */ rcu_read_unlock(); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); /* * We'll always have an inode attached to the buffer for completion @@ -3277,11 +3604,14 @@ cluster_corrupt_out: * xfs_buf_submit(). 
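
The mask arithmetic in xfs_iflush_cluster depends on inodes_per_cluster being a power of two: ~(n - 1) clears the low bits and rounds an AG inode number down to the first inode of its cluster. A quick worked example:

#include <stdio.h>

int main(void)
{
	unsigned long inodes_per_cluster = 32;	/* must be a power of two */
	unsigned long mask = ~(inodes_per_cluster - 1);
	unsigned long agino = 77;

	/* 77 & ~31 == 64: the gang lookup then scans inodes 64..95 */
	printf("first_index = %lu\n", agino & mask);
	return 0;
}
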
*/ ASSERT(bp->b_iodone); + bp->b_flags |= XBF_ASYNC; bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); xfs_buf_ioend(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + /* abort the corrupt inode, as it was not attached to the buffer */ xfs_iflush_abort(cip, false); kmem_free(cilist); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index be2014520155..558173f95a03 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -45,10 +45,18 @@ typedef struct xfs_inode { mrlock_t i_lock; /* inode lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ + + /* + * Bitsets of inode metadata that have been checked and/or are sick. + * Callers must hold i_flags_lock before accessing this field. + */ + uint16_t i_checked; + uint16_t i_sick; + spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ unsigned long i_flags; /* see defined flags below */ - unsigned int i_delayed_blks; /* count of delay alloc blks */ + uint64_t i_delayed_blks; /* count of delay alloc blks */ struct xfs_icdinode i_d; /* most of ondisk inode */ @@ -57,6 +65,11 @@ typedef struct xfs_inode { /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ + + /* pending io completions */ + spinlock_t i_ioend_lock; + struct work_struct i_ioend_work; + struct list_head i_ioend_list; } xfs_inode_t; /* Convert from vfs inode to xfs inode */ @@ -500,4 +513,9 @@ extern struct kmem_zone *xfs_inode_zone; bool xfs_inode_verify_forks(struct xfs_inode *ip); +int xfs_iunlink_init(struct xfs_perag *pag); +void xfs_iunlink_destroy(struct xfs_perag *pag); + +void xfs_end_io(struct work_struct *work); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index fa1c4fe2ffbf..c9a502eed204 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -5,6 +5,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -12,7 +13,6 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" #include "xfs_buf_item.h" @@ -565,7 +565,7 @@ out_unlock: * Unlock the inode associated with the inode log item. */ STATIC void -xfs_inode_item_unlock( +xfs_inode_item_release( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); @@ -621,23 +621,21 @@ xfs_inode_item_committed( STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, - xfs_lsn_t lsn) + xfs_lsn_t commit_lsn) { - INODE_ITEM(lip)->ili_last_lsn = lsn; + INODE_ITEM(lip)->ili_last_lsn = commit_lsn; + return xfs_inode_item_release(lip); } -/* - * This is the ops vector shared by all buf log items. 
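
The three new i_ioend_* fields in struct xfs_inode follow the standard "queue under a spinlock, drain in a workqueue" shape: completions are appended to i_ioend_list under i_ioend_lock, and i_ioend_work (bound to xfs_end_io in xfs_inode_alloc) empties the list in process context. A generic sketch of the pattern (illustrative names, not the patch's code):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct pending { struct list_head entry; };

static LIST_HEAD(pending_list);
static DEFINE_SPINLOCK(pending_lock);
static void drain(struct work_struct *w);
static DECLARE_WORK(drain_work, drain);

static void queue_completion(struct pending *p)
{
	unsigned long flags;

	spin_lock_irqsave(&pending_lock, flags);	/* irq-safe append */
	list_add_tail(&p->entry, &pending_list);
	spin_unlock_irqrestore(&pending_lock, flags);
	schedule_work(&drain_work);
}

static void drain(struct work_struct *w)
{
	LIST_HEAD(local);
	unsigned long flags;

	spin_lock_irqsave(&pending_lock, flags);
	list_splice_init(&pending_list, &local);	/* grab the batch */
	spin_unlock_irqrestore(&pending_lock, flags);
	/* process entries on 'local' without holding the lock */
}
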
- */ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, .iop_unpin = xfs_inode_item_unpin, - .iop_unlock = xfs_inode_item_unlock, + .iop_release = xfs_inode_item_release, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing, + .iop_committing = xfs_inode_item_committing, .iop_error = xfs_inode_item_error }; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 27081eba220c..07a60e74c39c 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -14,7 +14,7 @@ struct xfs_inode; struct xfs_mount; typedef struct xfs_inode_log_item { - xfs_log_item_t ili_item; /* common portion */ + struct xfs_log_item ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6ecdbb3af7de..6f7848cd5527 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -11,9 +11,8 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_ioctl.h" -#include "xfs_alloc.h" #include "xfs_rtalloc.h" +#include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" #include "xfs_attr.h" @@ -25,7 +24,6 @@ #include "xfs_export.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_symlink.h" #include "xfs_trans.h" #include "xfs_acl.h" #include "xfs_btree.h" @@ -33,15 +31,11 @@ #include "xfs_fsmap.h" #include "scrub/xfs_scrub.h" #include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_health.h" -#include <linux/capability.h> -#include <linux/cred.h> -#include <linux/dcache.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/exportfs.h> /* * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to @@ -719,16 +713,45 @@ out_unlock: return error; } +/* Return 0 on success or positive error */ +int +xfs_fsbulkstat_one_fmt( + struct xfs_ibulk *breq, + const struct xfs_bulkstat *bstat) +{ + struct xfs_bstat bs1; + + xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat); + if (copy_to_user(breq->ubuffer, &bs1, sizeof(bs1))) + return -EFAULT; + return xfs_ibulk_advance(breq, sizeof(struct xfs_bstat)); +} + +int +xfs_fsinumbers_fmt( + struct xfs_ibulk *breq, + const struct xfs_inumbers *igrp) +{ + struct xfs_inogrp ig1; + + xfs_inumbers_to_inogrp(&ig1, igrp); + if (copy_to_user(breq->ubuffer, &ig1, sizeof(struct xfs_inogrp))) + return -EFAULT; + return xfs_ibulk_advance(breq, sizeof(struct xfs_inogrp)); +} + STATIC int -xfs_ioc_bulkstat( +xfs_ioc_fsbulkstat( xfs_mount_t *mp, unsigned int cmd, void __user *arg) { - xfs_fsop_bulkreq_t bulkreq; - int count; /* # of records returned */ - xfs_ino_t inlast; /* last inode number */ - int done; + struct xfs_fsop_bulkreq bulkreq; + struct xfs_ibulk breq = { + .mp = mp, + .ocount = 0, + }; + xfs_ino_t lastino; int error; /* done = 1 if there are more stats to get and if bulkstat */ @@ -740,79 +763,287 @@ xfs_ioc_bulkstat( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq))) return -EFAULT; - if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64))) return -EFAULT; - if ((count = bulkreq.icount) <= 0) + if 
(bulkreq.icount <= 0) return -EINVAL; if (bulkreq.ubuffer == NULL) return -EINVAL; - if (cmd == XFS_IOC_FSINUMBERS) - error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer, xfs_inumbers_fmt); - else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) - error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer, - sizeof(xfs_bstat_t), NULL, &done); - else /* XFS_IOC_FSBULKSTAT */ - error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, - sizeof(xfs_bstat_t), bulkreq.ubuffer, - &done); + breq.ubuffer = bulkreq.ubuffer; + breq.icount = bulkreq.icount; + + /* + * FSBULKSTAT_SINGLE expects that *lastip contains the inode number + * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect + * that *lastip contains either zero or the number of the last inode to + * be examined by the previous call and return results starting with + * the next inode after that. The new bulk request back end functions + * take the inode to start with, so we have to compute the startino + * parameter from lastino to maintain correct function. lastino == 0 + * is a special case because it has traditionally meant "first inode + * in filesystem". + */ + if (cmd == XFS_IOC_FSINUMBERS) { + breq.startino = lastino ? lastino + 1 : 0; + error = xfs_inumbers(&breq, xfs_fsinumbers_fmt); + lastino = breq.startino - 1; + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) { + breq.startino = lastino; + breq.icount = 1; + error = xfs_bulkstat_one(&breq, xfs_fsbulkstat_one_fmt); + } else { /* XFS_IOC_FSBULKSTAT */ + breq.startino = lastino ? lastino + 1 : 0; + error = xfs_bulkstat(&breq, xfs_fsbulkstat_one_fmt); + lastino = breq.startino - 1; + } if (error) return error; - if (bulkreq.ocount != NULL) { - if (copy_to_user(bulkreq.lastip, &inlast, - sizeof(xfs_ino_t))) - return -EFAULT; + if (bulkreq.lastip != NULL && + copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t))) + return -EFAULT; - if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -EFAULT; + if (bulkreq.ocount != NULL && + copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32))) + return -EFAULT; + + return 0; +} + +/* Return 0 on success or positive error */ +static int +xfs_bulkstat_fmt( + struct xfs_ibulk *breq, + const struct xfs_bulkstat *bstat) +{ + if (copy_to_user(breq->ubuffer, bstat, sizeof(struct xfs_bulkstat))) + return -EFAULT; + return xfs_ibulk_advance(breq, sizeof(struct xfs_bulkstat)); +} + +/* + * Check the incoming bulk request @hdr from userspace and initialize the + * internal @breq bulk request appropriately. Returns 0 if the bulk request + * should proceed; XFS_ITER_ABORT if there's nothing to do; or the usual + * negative error code. + */ +static int +xfs_bulk_ireq_setup( + struct xfs_mount *mp, + struct xfs_bulk_ireq *hdr, + struct xfs_ibulk *breq, + void __user *ubuffer) +{ + if (hdr->icount == 0 || + (hdr->flags & ~XFS_BULK_IREQ_FLAGS_ALL) || + memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) + return -EINVAL; + + breq->startino = hdr->ino; + breq->ubuffer = ubuffer; + breq->icount = hdr->icount; + breq->ocount = 0; + breq->flags = 0; + + /* + * The @ino parameter is a special value, so we must look it up here. + * We're not allowed to have IREQ_AGNO, and we only return one inode + * worth of data. 
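
The compatibility shim above boils down to a change of cursor convention: the v1 ioctls speak "last inode seen" while the new backend speaks "first inode to examine", with lastino == 0 still meaning "start of filesystem". (FSBULKSTAT_SINGLE is the exception: its *lastip is already the inode to stat.) In shorthand (illustrative):

/* Illustrative: v1 "last inode" cursor <-> new "start inode" cursor. */
static unsigned long long to_startino(unsigned long long lastino)
{
	return lastino ? lastino + 1 : 0;	/* 0: start of filesystem */
}

static unsigned long long to_lastino(unsigned long long startino)
{
	return startino - 1;	/* what v1 callers expect written back */
}
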
+ */ + if (hdr->flags & XFS_BULK_IREQ_SPECIAL) { + if (hdr->flags & XFS_BULK_IREQ_AGNO) + return -EINVAL; + + switch (hdr->ino) { + case XFS_BULK_IREQ_SPECIAL_ROOT: + hdr->ino = mp->m_sb.sb_rootino; + break; + default: + return -EINVAL; + } + breq->icount = 1; } + /* + * The IREQ_AGNO flag means that we only want results from a given AG. + * If @hdr->ino is zero, we start iterating in that AG. If @hdr->ino is + * beyond the specified AG then we return no results. + */ + if (hdr->flags & XFS_BULK_IREQ_AGNO) { + if (hdr->agno >= mp->m_sb.sb_agcount) + return -EINVAL; + + if (breq->startino == 0) + breq->startino = XFS_AGINO_TO_INO(mp, hdr->agno, 0); + else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno) + return -EINVAL; + + breq->flags |= XFS_IBULK_SAME_AG; + + /* Asking for an inode past the end of the AG? We're done! */ + if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) + return XFS_ITER_ABORT; + } else if (hdr->agno) + return -EINVAL; + + /* Asking for an inode past the end of the FS? We're done! */ + if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount) + return XFS_ITER_ABORT; + return 0; } +/* + * Update the userspace bulk request @hdr to reflect the end state of the + * internal bulk request @breq. + */ +static void +xfs_bulk_ireq_teardown( + struct xfs_bulk_ireq *hdr, + struct xfs_ibulk *breq) +{ + hdr->ino = breq->startino; + hdr->ocount = breq->ocount; +} + +/* Handle the v5 bulkstat ioctl. */ STATIC int -xfs_ioc_fsgeometry_v1( - xfs_mount_t *mp, - void __user *arg) +xfs_ioc_bulkstat( + struct xfs_mount *mp, + unsigned int cmd, + struct xfs_bulkstat_req __user *arg) { - xfs_fsop_geom_t fsgeo; - int error; + struct xfs_bulk_ireq hdr; + struct xfs_ibulk breq = { + .mp = mp, + }; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) + return -EFAULT; - error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); + error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->bulkstat); + if (error == XFS_ITER_ABORT) + goto out_teardown; + if (error < 0) + return error; + + error = xfs_bulkstat(&breq, xfs_bulkstat_fmt); if (error) return error; - /* - * Caller should have passed an argument of type - * xfs_fsop_geom_v1_t. This is a proper subset of the - * xfs_fsop_geom_t that xfs_fs_geometry() fills in. - */ - if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) +out_teardown: + xfs_bulk_ireq_teardown(&hdr, &breq); + if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr))) return -EFAULT; + + return 0; +} + +STATIC int +xfs_inumbers_fmt( + struct xfs_ibulk *breq, + const struct xfs_inumbers *igrp) +{ + if (copy_to_user(breq->ubuffer, igrp, sizeof(struct xfs_inumbers))) + return -EFAULT; + return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers)); +} + +/* Handle the v5 inumbers ioctl. 
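Because xfs_bulk_ireq_teardown() writes the final cursor back into hdr->ino, the AGNO filtering above composes into a resumable per-AG scan from user space: reissue the ioctl until ocount comes back zero. A hedged sketch of that loop, under the same header assumptions as the previous example:

	/* Hypothetical sketch: list every allocated inode in one AG. */
	static int walk_one_ag(int fd, unsigned int agno)
	{
		struct xfs_bulkstat_req *req;
		unsigned int i;

		req = calloc(1, XFS_BULKSTAT_REQ_SIZE(64));
		if (!req)
			return -1;

		req->hdr.agno = agno;
		req->hdr.flags = XFS_BULK_IREQ_AGNO;
		req->hdr.icount = 64;
		req->hdr.ino = 0;		/* zero: first inode in this AG */

		for (;;) {
			if (ioctl(fd, XFS_IOC_BULKSTAT, req) < 0)
				break;
			if (req->hdr.ocount == 0)
				break;		/* cursor walked off the AG */
			for (i = 0; i < req->hdr.ocount; i++)
				printf("%llu\n", (unsigned long long)
				       req->bulkstat[i].bs_ino);
			/* hdr.ino now holds the resume cursor; just reissue. */
		}
		free(req);
		return 0;
	}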
*/ +STATIC int +xfs_ioc_inumbers( + struct xfs_mount *mp, + unsigned int cmd, + struct xfs_inumbers_req __user *arg) +{ + struct xfs_bulk_ireq hdr; + struct xfs_ibulk breq = { + .mp = mp, + }; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) + return -EFAULT; + + error = xfs_bulk_ireq_setup(mp, &hdr, &breq, arg->inumbers); + if (error == XFS_ITER_ABORT) + goto out_teardown; + if (error < 0) + return error; + + error = xfs_inumbers(&breq, xfs_inumbers_fmt); + if (error) + return error; + +out_teardown: + xfs_bulk_ireq_teardown(&hdr, &breq); + if (copy_to_user(&arg->hdr, &hdr, sizeof(hdr))) + return -EFAULT; + return 0; } STATIC int xfs_ioc_fsgeometry( - xfs_mount_t *mp, + struct xfs_mount *mp, + void __user *arg, + int struct_version) +{ + struct xfs_fsop_geom fsgeo; + size_t len; + + xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version); + + if (struct_version <= 3) + len = sizeof(struct xfs_fsop_geom_v1); + else if (struct_version == 4) + len = sizeof(struct xfs_fsop_geom_v4); + else { + xfs_fsop_geom_health(mp, &fsgeo); + len = sizeof(fsgeo); + } + + if (copy_to_user(arg, &fsgeo, len)) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_ag_geometry( + struct xfs_mount *mp, void __user *arg) { - xfs_fsop_geom_t fsgeo; + struct xfs_ag_geometry ageo; int error; - error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 4); + if (copy_from_user(&ageo, arg, sizeof(ageo))) + return -EFAULT; + + error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); if (error) return error; - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + if (copy_to_user(arg, &ageo, sizeof(ageo))) return -EFAULT; return 0; } @@ -871,37 +1102,44 @@ xfs_di2lxflags( return flags; } -STATIC int -xfs_ioc_fsgetxattr( - xfs_inode_t *ip, - int attr, - void __user *arg) +static void +xfs_fill_fsxattr( + struct xfs_inode *ip, + bool attr, + struct fsxattr *fa) { - struct fsxattr fa; - - memset(&fa, 0, sizeof(struct fsxattr)); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - fa.fsx_xflags = xfs_ip2xflags(ip); - fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; - fa.fsx_cowextsize = ip->i_d.di_cowextsize << + simple_fill_fsxattr(fa, xfs_ip2xflags(ip)); + fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa->fsx_cowextsize = ip->i_d.di_cowextsize << ip->i_mount->m_sb.sb_blocklog; - fa.fsx_projid = xfs_get_projid(ip); + fa->fsx_projid = xfs_get_projid(ip); if (attr) { if (ip->i_afp) { if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = xfs_iext_count(ip->i_afp); + fa->fsx_nextents = xfs_iext_count(ip->i_afp); else - fa.fsx_nextents = ip->i_d.di_anextents; + fa->fsx_nextents = ip->i_d.di_anextents; } else - fa.fsx_nextents = 0; + fa->fsx_nextents = 0; } else { if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = xfs_iext_count(&ip->i_df); + fa->fsx_nextents = xfs_iext_count(&ip->i_df); else - fa.fsx_nextents = ip->i_d.di_nextents; + fa->fsx_nextents = ip->i_d.di_nextents; } +} + +STATIC int +xfs_ioc_fsgetxattr( + xfs_inode_t *ip, + int attr, + void __user *arg) +{ + struct fsxattr fa; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + xfs_fill_fsxattr(ip, attr, &fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (copy_to_user(arg, &fa, sizeof(fa))) @@ -1027,15 +1265,6 @@ xfs_ioctl_setattr_xflags( if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) return -EINVAL; - /* - * Can't modify an immutable/append-only file unless - * we have appropriate permission. 
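The new XFS_IOC_AG_GEOMETRY handler above is caller-seeded: user space fills in only ag_number and the kernel rewrites the whole structure in place. A usage sketch follows; the summary field names shown (ag_length, ag_freeblks, ag_icount) are read off the uapi header and should be treated as assumptions here, not a documented contract.

	/* Hypothetical sketch: dump summary geometry for every AG. */
	static void dump_ag_geometry(int fd, unsigned int agcount)
	{
		struct xfs_ag_geometry ageo;
		unsigned int agno;

		for (agno = 0; agno < agcount; agno++) {
			memset(&ageo, 0, sizeof(ageo));
			ageo.ag_number = agno;	/* input: which AG to query */
			if (ioctl(fd, XFS_IOC_AG_GEOMETRY, &ageo) < 0)
				break;
			printf("AG %u: %u blocks, %u free, %u inodes\n",
			       agno, ageo.ag_length, ageo.ag_freeblks,
			       ageo.ag_icount);
		}
	}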
- */ - if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || - (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - /* diflags2 only valid for v3 inodes. */ di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (di_flags2 && ip->i_d.di_version < 3) @@ -1142,7 +1371,7 @@ xfs_ioctl_setattr_get_trans( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - return ERR_PTR(error); + goto out_unlock; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); @@ -1194,39 +1423,31 @@ xfs_ioctl_setattr_check_extsize( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; - - if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode)) - return -EINVAL; - - if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && - !S_ISDIR(VFS_I(ip)->i_mode)) - return -EINVAL; + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents && ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) return -EINVAL; - if (fa->fsx_extsize != 0) { - xfs_extlen_t size; - xfs_fsblock_t extsize_fsb; - - extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); - if (extsize_fsb > MAXEXTLEN) - return -EINVAL; + if (fa->fsx_extsize == 0) + return 0; - if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & FS_XFLAG_REALTIME)) { - size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; - } else { - size = mp->m_sb.sb_blocksize; - if (extsize_fsb > mp->m_sb.sb_agblocks / 2) - return -EINVAL; - } + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) + return -EINVAL; - if (fa->fsx_extsize % size) + if (XFS_IS_REALTIME_INODE(ip) || + (fa->fsx_xflags & FS_XFLAG_REALTIME)) { + size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) return -EINVAL; - } else - fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); + } + + if (fa->fsx_extsize % size) + return -EINVAL; return 0; } @@ -1252,6 +1473,8 @@ xfs_ioctl_setattr_check_cowextsize( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t size; + xfs_fsblock_t cowextsize_fsb; if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) return 0; @@ -1260,25 +1483,19 @@ xfs_ioctl_setattr_check_cowextsize( ip->i_d.di_version != 3) return -EINVAL; - if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode)) - return -EINVAL; - - if (fa->fsx_cowextsize != 0) { - xfs_extlen_t size; - xfs_fsblock_t cowextsize_fsb; + if (fa->fsx_cowextsize == 0) + return 0; - cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); - if (cowextsize_fsb > MAXEXTLEN) - return -EINVAL; + cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); + if (cowextsize_fsb > MAXEXTLEN) + return -EINVAL; - size = mp->m_sb.sb_blocksize; - if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) - return -EINVAL; + size = mp->m_sb.sb_blocksize; + if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; - if (fa->fsx_cowextsize % size) - return -EINVAL; - } else - fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + if (fa->fsx_cowextsize % size) + return -EINVAL; return 0; } @@ -1292,21 +1509,6 @@ xfs_ioctl_setattr_check_projid( if (fa->fsx_projid > (uint16_t)-1 && !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) return -EINVAL; - - /* - * Project Quota ID state is only allowed to change from within the init - * namespace. Enforce that restriction only if we are trying to change - * the quota ID state. 
Everything else is allowed in user namespaces. - */ - if (current_user_ns() == &init_user_ns) - return 0; - - if (xfs_get_projid(ip) != fa->fsx_projid) - return -EINVAL; - if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) != - (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) - return -EINVAL; - return 0; } @@ -1315,6 +1517,7 @@ xfs_ioctl_setattr( xfs_inode_t *ip, struct fsxattr *fa) { + struct fsxattr old_fa; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; struct xfs_dquot *udqp = NULL; @@ -1362,7 +1565,6 @@ xfs_ioctl_setattr( goto error_free_dquots; } - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && xfs_get_projid(ip) != fa->fsx_projid) { code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, @@ -1371,6 +1573,11 @@ xfs_ioctl_setattr( goto error_trans_cancel; } + xfs_fill_fsxattr(ip, false, &old_fa); + code = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, fa); + if (code) + goto error_trans_cancel; + code = xfs_ioctl_setattr_check_extsize(ip, fa); if (code) goto error_trans_cancel; @@ -1481,6 +1688,7 @@ xfs_ioc_setxflags( { struct xfs_trans *tp; struct fsxattr fa; + struct fsxattr old_fa; unsigned int flags; int join_flags = 0; int error; @@ -1516,6 +1724,13 @@ xfs_ioc_setxflags( goto out_drop_write; } + xfs_fill_fsxattr(ip, false, &old_fa); + error = vfs_ioc_fssetxattr_check(VFS_I(ip), &old_fa, &fa); + if (error) { + xfs_trans_cancel(tp); + goto out_drop_write; + } + error = xfs_ioctl_setattr_xflags(tp, ip, &fa); if (error) { xfs_trans_cancel(tp); @@ -1934,13 +2149,22 @@ xfs_file_ioctl( case XFS_IOC_FSBULKSTAT_SINGLE: case XFS_IOC_FSBULKSTAT: case XFS_IOC_FSINUMBERS: + return xfs_ioc_fsbulkstat(mp, cmd, arg); + + case XFS_IOC_BULKSTAT: return xfs_ioc_bulkstat(mp, cmd, arg); + case XFS_IOC_INUMBERS: + return xfs_ioc_inumbers(mp, cmd, arg); case XFS_IOC_FSGEOMETRY_V1: - return xfs_ioc_fsgeometry_v1(mp, arg); - + return xfs_ioc_fsgeometry(mp, arg, 3); + case XFS_IOC_FSGEOMETRY_V4: + return xfs_ioc_fsgeometry(mp, arg, 4); case XFS_IOC_FSGEOMETRY: - return xfs_ioc_fsgeometry(mp, arg); + return xfs_ioc_fsgeometry(mp, arg, 5); + + case XFS_IOC_AG_GEOMETRY: + return xfs_ioc_ag_geometry(mp, arg); case XFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); @@ -2031,9 +2255,7 @@ xfs_file_ioctl( case XFS_IOC_FSCOUNTS: { xfs_fsop_counts_t out; - error = xfs_fs_counts(mp, &out); - if (error) - return error; + xfs_fs_counts(mp, &out); if (copy_to_user(arg, &out, sizeof(out))) return -EFAULT; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 4b17f67c888a..654c0bb1bcf8 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -77,4 +77,12 @@ xfs_set_dmattrs( uint evmask, uint16_t state); +struct xfs_ibulk; +struct xfs_bstat; +struct xfs_inogrp; + +int xfs_fsbulkstat_one_fmt(struct xfs_ibulk *breq, + const struct xfs_bulkstat *bstat); +int xfs_fsinumbers_fmt(struct xfs_ibulk *breq, const struct xfs_inumbers *igrp); + #endif diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 5001dca361e9..7fcf7569743f 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -3,23 +3,19 @@ * Copyright (c) 2004-2005 Silicon Graphics, Inc. * All Rights Reserved. 
*/ -#include <linux/compat.h> -#include <linux/ioctl.h> #include <linux/mount.h> -#include <linux/slab.h> -#include <linux/uaccess.h> #include <linux/fsmap.h> #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" +#include "xfs_iwalk.h" #include "xfs_itable.h" -#include "xfs_error.h" #include "xfs_fsops.h" -#include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_attr.h" #include "xfs_ioctl.h" @@ -52,12 +48,9 @@ xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, compat_xfs_fsop_geom_v1_t __user *arg32) { - xfs_fsop_geom_t fsgeo; - int error; + struct xfs_fsop_geom fsgeo; - error = xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); - if (error) - return error; + xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); /* The 32-bit variant simply has some padding at the end */ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) return -EFAULT; @@ -87,27 +80,26 @@ xfs_compat_growfs_rt_copyin( } STATIC int -xfs_inumbers_fmt_compat( - void __user *ubuffer, - const struct xfs_inogrp *buffer, - long count, - long *written) +xfs_fsinumbers_fmt_compat( + struct xfs_ibulk *breq, + const struct xfs_inumbers *ig) { - compat_xfs_inogrp_t __user *p32 = ubuffer; - long i; + struct compat_xfs_inogrp __user *p32 = breq->ubuffer; + struct xfs_inogrp ig1; + struct xfs_inogrp *igrp = &ig1; - for (i = 0; i < count; i++) { - if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || - put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || - put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) - return -EFAULT; - } - *written = count * sizeof(*p32); - return 0; + xfs_inumbers_to_inogrp(&ig1, ig); + + if (put_user(igrp->xi_startino, &p32->xi_startino) || + put_user(igrp->xi_alloccount, &p32->xi_alloccount) || + put_user(igrp->xi_allocmask, &p32->xi_allocmask)) + return -EFAULT; + + return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_inogrp)); } #else -#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#define xfs_fsinumbers_fmt_compat xfs_fsinumbers_fmt #endif /* BROKEN_X86_ALIGNMENT */ STATIC int @@ -124,11 +116,14 @@ xfs_ioctl32_bstime_copyin( return 0; } -/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +/* + * struct xfs_bstat has differing alignment on intel, & bstime_t sizes + * everywhere + */ STATIC int xfs_ioctl32_bstat_copyin( - xfs_bstat_t *bstat, - compat_xfs_bstat_t __user *bstat32) + struct xfs_bstat *bstat, + struct compat_xfs_bstat __user *bstat32) { if (get_user(bstat->bs_ino, &bstat32->bs_ino) || get_user(bstat->bs_mode, &bstat32->bs_mode) || @@ -174,16 +169,15 @@ xfs_bstime_store_compat( /* Return 0 on success or positive error (to xfs_bulkstat()) */ STATIC int -xfs_bulkstat_one_fmt_compat( - void __user *ubuffer, - int ubsize, - int *ubused, - const xfs_bstat_t *buffer) +xfs_fsbulkstat_one_fmt_compat( + struct xfs_ibulk *breq, + const struct xfs_bulkstat *bstat) { - compat_xfs_bstat_t __user *p32 = ubuffer; + struct compat_xfs_bstat __user *p32 = breq->ubuffer; + struct xfs_bstat bs1; + struct xfs_bstat *buffer = &bs1; - if (ubsize < sizeof(*p32)) - return -ENOMEM; + xfs_bulkstat_to_bstat(breq->mp, &bs1, bstat); if (put_user(buffer->bs_ino, &p32->bs_ino) || put_user(buffer->bs_mode, &p32->bs_mode) || @@ -208,37 +202,24 @@ xfs_bulkstat_one_fmt_compat( put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) return -EFAULT; - if (ubused) - *ubused = sizeof(*p32); - return 0; -} 
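The field-by-field put_user() copies above exist because the i386 and x86-64 ABIs disagree about the padding of these structures, which is also why the compat mirrors are declared __compat_packed: a single copy_to_user() of the native struct would land fields at the wrong offsets for a 32-bit caller. The underlying effect, in a standalone illustration (not XFS code):

	/* Illustration: why a packed compat mirror of a struct is needed. */
	#include <stdio.h>
	#include <stdint.h>
	#include <stddef.h>

	struct natural {		/* x86-64 layout: u64 forces 8-byte alignment */
		uint32_t a;
		uint64_t b;
	};

	struct compat {			/* i386 layout, mirrored explicitly */
		uint32_t a;
		uint64_t b;
	} __attribute__((packed));

	int main(void)
	{
		/* Prints size 16 / b at 8 vs. size 12 / b at 4 on x86-64. */
		printf("natural: size %zu, b at %zu\n",
		       sizeof(struct natural), offsetof(struct natural, b));
		printf("compat:  size %zu, b at %zu\n",
		       sizeof(struct compat), offsetof(struct compat, b));
		return 0;
	}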
-STATIC int -xfs_bulkstat_one_compat( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* buffer to place output in */ - int ubsize, /* size of buffer */ - int *ubused, /* bytes used by me */ - int *stat) /* BULKSTAT_RV_... */ -{ - return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, - xfs_bulkstat_one_fmt_compat, - ubused, stat); + return xfs_ibulk_advance(breq, sizeof(struct compat_xfs_bstat)); } /* copied from xfs_ioctl.c */ STATIC int -xfs_compat_ioc_bulkstat( +xfs_compat_ioc_fsbulkstat( xfs_mount_t *mp, unsigned int cmd, - compat_xfs_fsop_bulkreq_t __user *p32) + struct compat_xfs_fsop_bulkreq __user *p32) { u32 addr; - xfs_fsop_bulkreq_t bulkreq; - int count; /* # of records returned */ - xfs_ino_t inlast; /* last inode number */ - int done; + struct xfs_fsop_bulkreq bulkreq; + struct xfs_ibulk breq = { + .mp = mp, + .ocount = 0, + }; + xfs_ino_t lastino; int error; /* @@ -247,9 +228,8 @@ xfs_compat_ioc_bulkstat( * to userpace memory via bulkreq.ubuffer. Normally the compat * functions and structure size are the correct ones to use ... */ - inumbers_fmt_pf inumbers_func = xfs_inumbers_fmt_compat; - bulkstat_one_pf bs_one_func = xfs_bulkstat_one_compat; - size_t bs_one_size = sizeof(struct compat_xfs_bstat); + inumbers_fmt_pf inumbers_func = xfs_fsinumbers_fmt_compat; + bulkstat_one_fmt_pf bs_one_func = xfs_fsbulkstat_one_fmt_compat; #ifdef CONFIG_X86_X32 if (in_x32_syscall()) { @@ -261,9 +241,8 @@ xfs_compat_ioc_bulkstat( * the data written out in compat layout will not match what * x32 userspace expects. */ - inumbers_func = xfs_inumbers_fmt; - bs_one_func = xfs_bulkstat_one; - bs_one_size = sizeof(struct xfs_bstat); + inumbers_func = xfs_fsinumbers_fmt; + bs_one_func = xfs_fsbulkstat_one_fmt; } #endif @@ -287,40 +266,55 @@ xfs_compat_ioc_bulkstat( return -EFAULT; bulkreq.ocount = compat_ptr(addr); - if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + if (copy_from_user(&lastino, bulkreq.lastip, sizeof(__s64))) return -EFAULT; - if ((count = bulkreq.icount) <= 0) + if (bulkreq.icount <= 0) return -EINVAL; if (bulkreq.ubuffer == NULL) return -EINVAL; + breq.ubuffer = bulkreq.ubuffer; + breq.icount = bulkreq.icount; + + /* + * FSBULKSTAT_SINGLE expects that *lastip contains the inode number + * that we want to stat. However, FSINUMBERS and FSBULKSTAT expect + * that *lastip contains either zero or the number of the last inode to + * be examined by the previous call and return results starting with + * the next inode after that. The new bulk request back end functions + * take the inode to start with, so we have to compute the startino + * parameter from lastino to maintain correct function. lastino == 0 + * is a special case because it has traditionally meant "first inode + * in filesystem". + */ if (cmd == XFS_IOC_FSINUMBERS_32) { - error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer, inumbers_func); + breq.startino = lastino ? lastino + 1 : 0; + error = xfs_inumbers(&breq, inumbers_func); + lastino = breq.startino - 1; } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { - int res; - - error = bs_one_func(mp, inlast, bulkreq.ubuffer, - bs_one_size, NULL, &res); + breq.startino = lastino; + breq.icount = 1; + error = xfs_bulkstat_one(&breq, bs_one_func); + lastino = breq.startino; } else if (cmd == XFS_IOC_FSBULKSTAT_32) { - error = xfs_bulkstat(mp, &inlast, &count, - bs_one_func, bs_one_size, - bulkreq.ubuffer, &done); - } else + breq.startino = lastino ? 
lastino + 1 : 0; + error = xfs_bulkstat(&breq, bs_one_func); + lastino = breq.startino - 1; + } else { error = -EINVAL; + } if (error) return error; - if (bulkreq.ocount != NULL) { - if (copy_to_user(bulkreq.lastip, &inlast, - sizeof(xfs_ino_t))) - return -EFAULT; + if (bulkreq.lastip != NULL && + copy_to_user(bulkreq.lastip, &lastino, sizeof(xfs_ino_t))) + return -EFAULT; - if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -EFAULT; - } + if (bulkreq.ocount != NULL && + copy_to_user(bulkreq.ocount, &breq.ocount, sizeof(__s32))) + return -EFAULT; return 0; } @@ -561,7 +555,9 @@ xfs_file_compat_ioctl( switch (cmd) { /* No size or alignment issues on any arch */ case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY_V4: case XFS_IOC_FSGEOMETRY: + case XFS_IOC_AG_GEOMETRY: case XFS_IOC_FSGETXATTR: case XFS_IOC_FSSETXATTR: case XFS_IOC_FSGETXATTRA: @@ -578,6 +574,8 @@ xfs_file_compat_ioctl( case XFS_IOC_ERROR_CLEARALL: case FS_IOC_GETFSMAP: case XFS_IOC_SCRUB_METADATA: + case XFS_IOC_BULKSTAT: + case XFS_IOC_INUMBERS: return xfs_file_ioctl(filp, cmd, p); #if !defined(BROKEN_X86_ALIGNMENT) || defined(CONFIG_X86_X32) /* @@ -675,7 +673,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSBULKSTAT_32: case XFS_IOC_FSBULKSTAT_SINGLE_32: case XFS_IOC_FSINUMBERS_32: - return xfs_compat_ioc_bulkstat(mp, cmd, arg); + return xfs_compat_ioc_fsbulkstat(mp, cmd, arg); case XFS_IOC_FD_TO_HANDLE_32: case XFS_IOC_PATH_TO_HANDLE_32: case XFS_IOC_PATH_TO_FSHANDLE_32: { diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index d28fa824284a..7985344d3aa6 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -36,7 +36,7 @@ typedef struct compat_xfs_bstime { __s32 tv_nsec; /* and nanoseconds */ } compat_xfs_bstime_t; -typedef struct compat_xfs_bstat { +struct compat_xfs_bstat { __u64 bs_ino; /* inode number */ __u16 bs_mode; /* type and mode */ __u16 bs_nlink; /* number of links */ @@ -61,14 +61,14 @@ typedef struct compat_xfs_bstat { __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ -} __compat_packed compat_xfs_bstat_t; +} __compat_packed; -typedef struct compat_xfs_fsop_bulkreq { +struct compat_xfs_fsop_bulkreq { compat_uptr_t lastip; /* last inode # pointer */ __s32 icount; /* count of entries in buffer */ compat_uptr_t ubuffer; /* user buffer for inode desc. 
*/ compat_uptr_t ocount; /* output count pointer */ -} compat_xfs_fsop_bulkreq_t; +}; #define XFS_IOC_FSBULKSTAT_32 \ _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) @@ -106,7 +106,7 @@ typedef struct compat_xfs_swapext { xfs_off_t sx_offset; /* offset into file */ xfs_off_t sx_length; /* leng from offset */ char sx_pad[16]; /* pad space, unused */ - compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ + struct compat_xfs_bstat sx_stat; /* stat of target b4 copy */ } __compat_packed compat_xfs_swapext_t; #define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) @@ -201,11 +201,11 @@ typedef struct compat_xfs_fsop_geom_v1 { #define XFS_IOC_FSGEOMETRY_V1_32 \ _IOR('X', 100, struct compat_xfs_fsop_geom_v1) -typedef struct compat_xfs_inogrp { +struct compat_xfs_inogrp { __u64 xi_startino; /* starting inode number */ __s32 xi_alloccount; /* # bits set in allocmask */ __u64 xi_allocmask; /* mask of allocated inodes */ -} __attribute__((packed)) compat_xfs_inogrp_t; +} __attribute__((packed)); /* These growfs input structures have padding on the end, so must translate */ typedef struct compat_xfs_growfs_data { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 27c93b5f029d..3a4310d7cb59 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -4,7 +4,6 @@ * Copyright (c) 2016-2018 Christoph Hellwig. * All Rights Reserved. */ -#include <linux/iomap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -12,7 +11,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap_btree.h" @@ -25,7 +23,6 @@ #include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" -#include "xfs_icache.h" #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" @@ -35,18 +32,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -void +static int +xfs_alert_fsblock_zero( + xfs_inode_t *ip, + xfs_bmbt_irec_t *imap) +{ + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return -EFSCORRUPTED; +} + +int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool shared) { struct xfs_mount *mp = ip->i_mount; + if (unlikely(!imap->br_startblock && !XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { + } else if (imap->br_startblock == DELAYSTARTBLOCK || + isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { @@ -60,6 +79,13 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); + + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + if (shared) + iomap->flags |= IOMAP_F_SHARED; + return 0; } static void @@ -138,23 +164,6 @@ xfs_iomap_eof_align_last_fsb( return 0; } -STATIC int -xfs_alert_fsblock_zero( - xfs_inode_t *ip, - 
xfs_bmbt_irec_t *imap) -{ - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)imap->br_startblock, - (unsigned long long)imap->br_startoff, - (unsigned long long)imap->br_blockcount, - imap->br_state); - return -EFSCORRUPTED; -} - int xfs_iomap_write_direct( xfs_inode_t *ip, @@ -383,12 +392,13 @@ xfs_quota_calc_throttle( STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, + int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmbt_irec prev; int shift = 0; @@ -522,15 +532,16 @@ xfs_file_iomap_begin_delay( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); xfs_fileoff_t end_fsb; - int error = 0, eof = 0; - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; + bool eof = false, cow_eof = false, shared = false; + int whichfork = XFS_DATA_FORK; + int error = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -548,7 +559,7 @@ xfs_file_iomap_begin_delay( XFS_STATS_INC(mp, xs_blk_mapw); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; @@ -556,53 +567,101 @@ xfs_file_iomap_begin_delay( end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); + /* + * Search the data fork fork first to look up our source mapping. We + * always need the data fork map, as we have to return it to the + * iomap code so that the higher level write code can read data in to + * perform read-modify-write cycles for unaligned writes. + */ + eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) - got.br_startoff = end_fsb; /* fake hole until the end */ + imap.br_startoff = end_fsb; /* fake hole until the end */ + + /* We never need to allocate blocks for zeroing a hole. */ + if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); + goto out_unlock; + } - if (got.br_startoff <= offset_fsb) { + /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare + * block adjacent to shared blocks instead of just the shared blocks + * themselves. Second the lookup in the extent list is generally faster + * than going out to the shared extent tree. 
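The code that follows leans heavily on xfs_trim_extent() to clamp mappings to the range of interest. For readers without the tree at hand, the helper's behaviour amounts to interval intersection; here is a simplified standalone sketch of that logic, with the XFS types reduced to plain integers (an illustration, not the kernel helper):

	#include <stdint.h>

	struct irec {
		uint64_t startoff;	/* file offset, in blocks */
		uint64_t blockcount;	/* length, in blocks */
	};

	/* Clamp *irec to the window [bno, bno + len); no overlap -> count 0. */
	static void trim_extent(struct irec *irec, uint64_t bno, uint64_t len)
	{
		uint64_t start = irec->startoff;
		uint64_t end = irec->startoff + irec->blockcount;
		uint64_t wend = bno + len;

		/* Intersect [start, end) with [bno, wend). */
		start = start > bno ? start : bno;
		end = end < wend ? end : wend;

		if (start >= end) {
			irec->blockcount = 0;
			return;
		}
		irec->startoff = start;
		irec->blockcount = end - start;
	}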
+ */ + if (xfs_is_cow_inode(ip)) { + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, + &ccur, &cmap); + if (!cow_eof && cmap.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &cmap); + whichfork = XFS_COW_FORK; + goto done; + } + } + + if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - got.br_state != XFS_EXT_UNWRITTEN)) { - xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got); - if (error) - goto out_unlock; + if (!xfs_is_cow_inode(ip) || + ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; } - trace_xfs_iomap_found(ip, offset, count, 0, &got); - goto done; - } + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); - if (flags & IOMAP_ZERO) { - xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); - goto out_unlock; + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_inode_need_cow(ip, &imap, &shared); + if (error) + goto out_unlock; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, + &imap); + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the + * end of the extent. + */ + whichfork = XFS_COW_FORK; + end_fsb = imap.br_startoff + imap.br_blockcount; + } else { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat + * symmetric with the work writeback does. This is a completely + * arbitrary number pulled out of thin air. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (xfs_is_always_cow_inode(ip)) + whichfork = XFS_COW_FORK; } error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages - * to keep the chunks of work done where somewhat symmetric with the - * work writeback does. This is a completely arbitrary number pulled - * out of thin air as a best guess for initial testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); - if (eof) { - prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, - &icur); + prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset, + count, &icur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; @@ -623,9 +682,11 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &icur, - eof); + error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb, + end_fsb - offset_fsb, prealloc_blocks, + whichfork == XFS_DATA_FORK ? &imap : &cmap, + whichfork == XFS_DATA_FORK ? &icur : &ccur, + whichfork == XFS_DATA_FORK ? 
eof : cow_eof); switch (error) { case 0: break; @@ -647,186 +708,22 @@ retry: * them out if the write happens to fail. */ iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); + trace_xfs_iomap_alloc(ip, offset, count, whichfork, + whichfork == XFS_DATA_FORK ? &imap : &cmap); done: - if (isnullstartblock(got.br_startblock)) - got.br_startblock = DELAYSTARTBLOCK; - - if (!got.br_startblock) { - error = xfs_alert_fsblock_zero(ip, &got); - if (error) + if (whichfork == XFS_COW_FORK) { + if (imap.br_startoff > offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, + imap.br_startoff - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); goto out_unlock; - } - - xfs_bmbt_to_iomap(ip, iomap, &got); - -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; -} - -/* - * Pass in a delayed allocate extent, convert it to real extents; - * return to the caller the extent we create which maps on top of - * the originating callers request. - * - * Called without a lock on the inode. - * - * We no longer bother to look at the incoming map - all we have to - * guarantee is that whatever we allocate fills the required range. - */ -int -xfs_iomap_write_allocate( - xfs_inode_t *ip, - int whichfork, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - unsigned int *cow_seq) -{ - xfs_mount_t *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_fileoff_t offset_fsb, last_block; - xfs_fileoff_t end_fsb, map_start_fsb; - xfs_filblks_t count_fsb; - xfs_trans_t *tp; - int nimaps; - int error = 0; - int flags = XFS_BMAPI_DELALLOC; - int nres; - - if (whichfork == XFS_COW_FORK) - flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC; - - /* - * Make sure that the dquots are there. - */ - error = xfs_qm_dqattach(ip); - if (error) - return error; - - offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = imap->br_blockcount; - map_start_fsb = imap->br_startoff; - - XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); - - while (count_fsb != 0) { - /* - * Set up a transaction with which to allocate the - * backing store for the file. Do allocations in a - * loop until we get some space in the range we are - * interested in. The other space that might be allocated - * is in the delayed allocation extent on which we sit - * but before our buffer starts. - */ - nimaps = 0; - while (nimaps == 0) { - nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - /* - * We have already reserved space for the extent and any - * indirect blocks when creating the delalloc extent, - * there is no need to reserve space in this transaction - * again. - */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - - /* - * it is possible that the extents have changed since - * we did the read call as we dropped the ilock for a - * while. We have to be careful about truncates or hole - * punchs here - we are not allowed to allocate - * non-delalloc blocks here. - * - * The only protection against truncation is the pages - * for the range we are being asked to convert are - * locked and hence a truncate will block on them - * first. - * - * As a result, if we go beyond the range we really - * need and hit an delalloc extent boundary followed by - * a hole while we have excess blocks in the map, we - * will fill the hole incorrectly and overrun the - * transaction reservation. 
- * - * Using a single map prevents this as we are forced to - * check each map we look for overlap with the desired - * range and abort as soon as we find it. Also, given - * that we only return a single map, having one beyond - * what we can return is probably a bit silly. - * - * We also need to check that we don't go beyond EOF; - * this is a truncate optimisation as a truncate sets - * the new file size before block on the pages we - * currently have locked under writeback. Because they - * are about to be tossed, we don't need to write them - * back.... - */ - nimaps = 1; - end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); - error = xfs_bmap_last_offset(ip, &last_block, - XFS_DATA_FORK); - if (error) - goto trans_cancel; - - last_block = XFS_FILEOFF_MAX(last_block, end_fsb); - if ((map_start_fsb + count_fsb) > last_block) { - count_fsb = last_block - map_start_fsb; - if (count_fsb == 0) { - error = -EAGAIN; - goto trans_cancel; - } - } - - /* - * From this point onwards we overwrite the imap - * pointer that the caller gave to us. - */ - error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, flags, nres, imap, - &nimaps); - if (error) - goto trans_cancel; - - error = xfs_trans_commit(tp); - if (error) - goto error0; - - if (whichfork == XFS_COW_FORK) - *cow_seq = READ_ONCE(ifp->if_seq); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - /* - * See if we were able to allocate an extent that - * covers at least part of the callers request - */ - if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, imap); - - if ((offset_fsb >= imap->br_startoff) && - (offset_fsb < (imap->br_startoff + - imap->br_blockcount))) { - XFS_STATS_INC(mp, xs_xstrat_quick); - return 0; } - - /* - * So far we have not mapped the requested part of the - * file, just surrounding data, try again. - */ - count_fsb -= imap->br_blockcount; - map_start_fsb = imap->br_startoff + imap->br_blockcount; + /* ensure we only report blocks we have a reservation for */ + xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount); + shared = true; } - -trans_cancel: - xfs_trans_cancel(tp); -error0: + error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared); +out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -879,7 +776,7 @@ xfs_iomap_write_unwritten( * complete here and might deadlock on the iolock. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + XFS_TRANS_RESERVE, &tp); if (error) return error; @@ -975,7 +872,7 @@ xfs_ilock_for_iomap( * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && is_write) { + if (xfs_is_cow_inode(ip) && is_write) { /* * FIXME: It could still overwrite on unshared extents and not * need allocation. @@ -1009,7 +906,7 @@ relock: * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ - if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) { + if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; @@ -1081,23 +978,33 @@ xfs_file_iomap_begin( * Break shared extents if necessary. Checks for non-blocking IO have * been done up front, so we don't need to do them here. 
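xfs_ilock_for_iomap() above ends in a relock loop because rwsem-style locks cannot be upgraded from shared to exclusive in place without risking deadlock: the only safe move is to drop the lock, reacquire it exclusive, and re-run the checks that chose the mode, since the state may have changed while the lock was dropped. The same idiom in a self-contained pthreads sketch (is_cow stands in for the inode state check):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>

	static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
	static atomic_bool is_cow;	/* stand-in for xfs_is_cow_inode() */

	/* Take the lock in the weakest mode the current state allows. */
	static void lock_for_write(bool *exclusive)
	{
		*exclusive = false;
		pthread_rwlock_rdlock(&lock);
	relock:
		/*
		 * The mode decision was sampled under the lock we hold now.
		 * A shared holder cannot upgrade in place, so if the state
		 * demands exclusive access: drop, reacquire exclusive, and
		 * re-check from the top.
		 */
		if (!*exclusive && atomic_load(&is_cow)) {
			pthread_rwlock_unlock(&lock);
			pthread_rwlock_wrlock(&lock);
			*exclusive = true;
			goto relock;
		}
	}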
*/ - if (xfs_is_reflink_inode(ip)) { + if (xfs_is_cow_inode(ip)) { + struct xfs_bmbt_irec cmap; + bool directio = (flags & IOMAP_DIRECT); + /* if zeroing doesn't need COW allocation, then we are done. */ if ((flags & IOMAP_ZERO) && !needs_cow_for_zeroing(&imap, nimaps)) goto out_found; - if (flags & IOMAP_DIRECT) { - /* may drop and re-acquire the ilock */ - error = xfs_reflink_allocate_cow(ip, &imap, &shared, - &lockmode); - if (error) - goto out_unlock; - } else { - error = xfs_reflink_reserve_cow(ip, &imap); - if (error) - goto out_unlock; - } + /* may drop and re-acquire the ilock */ + cmap = imap; + error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode, + directio); + if (error) + goto out_unlock; + + /* + * For buffered writes we need to report the address of the + * previous block (if there was any) so that the higher level + * write code can perform read-modify-write operations; we + * won't need the CoW fork mapping until writeback. For direct + * I/O, which must be block aligned, we need to report the + * newly allocated address. If the data fork has a hole, copy + * the COW fork mapping to avoid allocating to the data fork. + */ + if (directio || imap.br_startblock == HOLESTARTBLOCK) + imap = cmap; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; @@ -1139,23 +1046,15 @@ xfs_file_iomap_begin( return error; iomap->flags |= IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); out_finish: - if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields - & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; - - xfs_bmbt_to_iomap(ip, iomap, &imap); - - if (shared) - iomap->flags |= IOMAP_F_SHARED; - return 0; + return xfs_bmbt_to_iomap(ip, iomap, &imap, shared); out_found: ASSERT(nimaps); xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); goto out_finish; out_unlock: @@ -1241,6 +1140,92 @@ const struct iomap_ops xfs_iomap_ops = { }; static int +xfs_seek_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + lockmode = xfs_ilock_data_map_shared(ip); + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + } + + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { + /* + * If we found a data extent we are done. + */ + if (imap.br_startoff <= offset_fsb) + goto done; + data_fsb = imap.br_startoff; + } else { + /* + * Fake a hole until the end of the file. 
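xfs_seek_iomap_begin() backs the generic SEEK_HOLE/SEEK_DATA machinery, including the subtlety that COW-fork extents are reported as unwritten so the page cache gets probed for dirty data. From user space the whole mechanism is reached through plain lseek(2); a minimal sketch:

	/* Minimal sketch: enumerate the data segments of a file via lseek(2). */
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>

	static void map_segments(const char *path)
	{
		int fd = open(path, O_RDONLY);
		off_t end, data, hole = 0;

		if (fd < 0)
			return;
		end = lseek(fd, 0, SEEK_END);
		while (hole < end) {
			data = lseek(fd, hole, SEEK_DATA);
			if (data < 0)
				break;		/* no more data before EOF */
			hole = lseek(fd, data, SEEK_HOLE);
			printf("data: [%lld, %lld)\n",
			       (long long)data, (long long)hole);
		}
		close(fd);
	}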
+ */ + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); + } + + /* + * If a COW fork extent covers the hole, report it - capped to the next + * data fork extent: + */ + if (xfs_inode_has_cow_data(ip) && + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cow_fsb = cmap.br_startoff; + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed + * by this extent. + */ + iomap->type = IOMAP_UNWRITTEN; + goto out_unlock; + } + + /* + * Else report a hole, capped to the next found data or COW extent. + */ + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) + imap.br_blockcount = cow_fsb - offset_fsb; + else + imap.br_blockcount = data_fsb - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; +done: + xfs_trim_extent(&imap, offset_fsb, end_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + +const struct iomap_ops xfs_seek_iomap_ops = { + .iomap_begin = xfs_seek_iomap_begin, +}; + +static int xfs_xattr_iomap_begin( struct inode *inode, loff_t offset, @@ -1273,12 +1258,10 @@ xfs_xattr_iomap_begin( out_unlock: xfs_iunlock(ip, lockmode); - if (!error) { - ASSERT(nimaps); - xfs_bmbt_to_iomap(ip, iomap, &imap); - } - - return error; + if (error) + return error; + ASSERT(nimaps); + return xfs_bmbt_to_iomap(ip, iomap, &imap, false); } const struct iomap_ops xfs_xattr_iomap_ops = { diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index c6170548831b..5c2f6aa6d78f 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -13,12 +13,10 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, - struct xfs_bmbt_irec *, unsigned int *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); -void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, - struct xfs_bmbt_irec *); +int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *, bool shared); xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); static inline xfs_filblks_t @@ -42,6 +40,7 @@ xfs_aligned_fsb_count( } extern const struct iomap_ops xfs_iomap_ops; +extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f48ffd7a8d3e..ff3c1fae5357 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -10,30 +10,20 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_acl.h" #include "xfs_quota.h" -#include "xfs_error.h" #include "xfs_attr.h" #include "xfs_trans.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" -#include "xfs_trans_space.h" #include "xfs_iomap.h" -#include "xfs_defer.h" -#include <linux/capability.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include 
<linux/iomap.h> -#include <linux/slab.h> #include <linux/iversion.h> /* @@ -191,9 +181,18 @@ xfs_generic_create( xfs_setup_iops(ip); - if (tmpfile) + if (tmpfile) { + /* + * The VFS requires that any inode fed to d_tmpfile must have + * nlink == 1 so that it can decrement the nlink in d_tmpfile. + * However, we created the temp file with nlink == 0 because + * we're not allowed to put an inode with nlink > 0 on the + * unlinked list. Therefore we have to set nlink to 1 so that + * d_tmpfile can immediately set it back to zero. + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); - else + } else d_instantiate(dentry, inode); xfs_finish_inode_setup(ip); @@ -522,6 +521,10 @@ xfs_vn_getattr( } } + /* + * Note: If you add another clause to set an attribute flag, please + * update attributes_mask below. + */ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) @@ -529,6 +532,10 @@ xfs_vn_getattr( if (ip->i_d.di_flags & XFS_DIFLAG_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND | + STATX_ATTR_NODUMP); + switch (inode->i_mode & S_IFMT) { case S_IFBLK: case S_IFCHR: diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 942e4aa5e729..a8a06bb78ea8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -14,45 +14,66 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_ialloc_btree.h" +#include "xfs_iwalk.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_health.h" /* - * Return stat information for one inode. - * Return 0 if ok, else errno. + * Bulk Stat + * ========= + * + * Use the inode walking functions to fill out struct xfs_bulkstat for every + * allocated inode, then pass the stat information to some externally provided + * iteration function. */ -int + +struct xfs_bstat_chunk { + bulkstat_one_fmt_pf formatter; + struct xfs_ibulk *breq; + struct xfs_bulkstat *buf; +}; + +/* + * Fill out the bulkstat info for a single inode and report it somewhere. + * + * bc->breq->lastino is effectively the inode cursor as we walk through the + * filesystem. Therefore, we update it any time we need to move the cursor + * forward, regardless of whether or not we're sending any bstat information + * back to userspace. If the inode is internal metadata or, has been freed + * out from under us, we just simply keep going. + * + * However, if any other type of error happens we want to stop right where we + * are so that userspace will call back with exact number of the bad inode and + * we can send back an error code. + * + * Note that if the formatter tells us there's no space left in the buffer we + * move the cursor forward and abort the walk. + */ +STATIC int xfs_bulkstat_one_int( - struct xfs_mount *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode to get data for */ - void __user *buffer, /* buffer to place output in */ - int ubsize, /* size of buffer */ - bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ - int *ubused, /* bytes used by me */ - int *stat) /* BULKSTAT_RV_... 
*/ + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + struct xfs_bstat_chunk *bc) { struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; - struct xfs_bstat *buf; /* return buffer */ - int error = 0; /* error value */ + struct xfs_bulkstat *buf = bc->buf; + int error = -EINVAL; - *stat = BULKSTAT_RV_NOTHING; + if (xfs_internal_inum(mp, ino)) + goto out_advance; - if (!buffer || xfs_internal_inum(mp, ino)) - return -EINVAL; - - buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); - if (!buf) - return -ENOMEM; - - error = xfs_iget(mp, NULL, ino, + error = xfs_iget(mp, tp, ino, (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), XFS_ILOCK_SHARED, &ip); + if (error == -ENOENT || error == -EINVAL) + goto out_advance; if (error) - goto out_free; + goto out; ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); @@ -63,36 +84,35 @@ xfs_bulkstat_one_int( /* xfs_iget returns the following without needing * further change. */ - buf->bs_projid_lo = dic->di_projid_lo; - buf->bs_projid_hi = dic->di_projid_hi; + buf->bs_projectid = xfs_get_projid(ip); buf->bs_ino = ino; buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; - buf->bs_atime.tv_sec = inode->i_atime.tv_sec; - buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec; - buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec; - buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec; - buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec; + buf->bs_atime = inode->i_atime.tv_sec; + buf->bs_atime_nsec = inode->i_atime.tv_nsec; + buf->bs_mtime = inode->i_mtime.tv_sec; + buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; + buf->bs_ctime = inode->i_ctime.tv_sec; + buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; + buf->bs_btime = dic->di_crtime.t_sec; + buf->bs_btime_nsec = dic->di_crtime.t_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; buf->bs_xflags = xfs_ip2xflags(ip); - buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog; + buf->bs_extsize_blks = dic->di_extsize; buf->bs_extents = dic->di_nextents; - memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); - buf->bs_dmevmask = dic->di_dmevmask; - buf->bs_dmstate = dic->di_dmstate; + xfs_bulkstat_health(ip, buf); buf->bs_aextents = dic->di_anextents; buf->bs_forkoff = XFS_IFORK_BOFF(ip); + buf->bs_version = XFS_BULKSTAT_VERSION_V5; if (dic->di_version == 3) { if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) - buf->bs_cowextsize = dic->di_cowextsize << - mp->m_sb.sb_blocklog; + buf->bs_cowextsize_blks = dic->di_cowextsize; } switch (dic->di_format) { @@ -116,385 +136,121 @@ xfs_bulkstat_one_int( xfs_iunlock(ip, XFS_ILOCK_SHARED); xfs_irele(ip); - error = formatter(buffer, ubsize, ubused, buf); - if (!error) - *stat = BULKSTAT_RV_DIDONE; + error = bc->formatter(bc->breq, buf); + if (error == XFS_IBULK_ABORT) + goto out_advance; + if (error) + goto out; - out_free: - kmem_free(buf); +out_advance: + /* + * Advance the cursor to the inode that comes after the one we just + * looked at. We want the caller to move along if the bulkstat + * information was copied successfully; if we tried to grab the inode + * but it's no longer allocated; or if it's internal metadata. 
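The error contract these comments describe — advance-and-continue for expected misses, a positive sentinel for a clean stop, a negative errno so the caller learns the exact failing inode — generalizes beyond bulkstat. A reduced standalone model of the same contract (names here are illustrative, not the kernel's):

	#include <stdint.h>

	#define WALK_ABORT	1	/* positive sentinel: stop, not an error */

	typedef int (*walk_fn)(uint64_t ino, void *priv);

	/*
	 * Run fn for each candidate, keeping *cursor pointed at the next
	 * item so an interrupted walk can be resumed.  fn returns 0 to
	 * continue, WALK_ABORT to stop cleanly (e.g. output buffer full),
	 * or a negative errno that is handed straight back to the caller.
	 */
	static int walk(uint64_t *cursor, uint64_t last, walk_fn fn, void *priv)
	{
		int error = 0;

		while (*cursor <= last) {
			error = fn(*cursor, priv);
			if (error == WALK_ABORT)
				return 0;	/* resume later from *cursor */
			if (error)
				break;		/* *cursor names the bad item */
			(*cursor)++;
		}
		return error;
	}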
+ */ + bc->breq->startino = ino + 1; +out: return error; } -/* Return 0 on success or positive error */ -STATIC int -xfs_bulkstat_one_fmt( - void __user *ubuffer, - int ubsize, - int *ubused, - const xfs_bstat_t *buffer) -{ - if (ubsize < sizeof(*buffer)) - return -ENOMEM; - if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) - return -EFAULT; - if (ubused) - *ubused = sizeof(*buffer); - return 0; -} - +/* Bulkstat a single inode. */ int xfs_bulkstat_one( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* buffer to place output in */ - int ubsize, /* size of buffer */ - int *ubused, /* bytes used by me */ - int *stat) /* BULKSTAT_RV_... */ + struct xfs_ibulk *breq, + bulkstat_one_fmt_pf formatter) { - return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, - xfs_bulkstat_one_fmt, ubused, stat); -} + struct xfs_bstat_chunk bc = { + .formatter = formatter, + .breq = breq, + }; + int error; -/* - * Loop over all clusters in a chunk for a given incore inode allocation btree - * record. Do a readahead if there are any allocated inodes in that cluster. - */ -STATIC void -xfs_bulkstat_ichunk_ra( - struct xfs_mount *mp, - xfs_agnumber_t agno, - struct xfs_inobt_rec_incore *irec) -{ - xfs_agblock_t agbno; - struct blk_plug plug; - int i; /* inode chunk index */ - - agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); - - blk_start_plug(&plug); - for (i = 0; i < XFS_INODES_PER_CHUNK; - i += mp->m_inodes_per_cluster, agbno += mp->m_blocks_per_cluster) { - if (xfs_inobt_maskn(i, mp->m_inodes_per_cluster) & - ~irec->ir_free) { - xfs_btree_reada_bufs(mp, agno, agbno, - mp->m_blocks_per_cluster, - &xfs_inode_buf_ops); - } - } - blk_finish_plug(&plug); -} + ASSERT(breq->icount == 1); -/* - * Lookup the inode chunk that the given inode lives in and then get the record - * if we found the chunk. If the inode was not the last in the chunk and there - * are some left allocated, update the data for the pointed-to record as well as - * return the count of grabbed inodes. - */ -STATIC int -xfs_bulkstat_grab_ichunk( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t agino, /* starting inode of chunk */ - int *icount,/* return # of inodes grabbed */ - struct xfs_inobt_rec_incore *irec) /* btree record */ -{ - int idx; /* index into inode chunk */ - int stat; - int error = 0; - - /* Lookup the inode chunk that this inode lives in */ - error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat); - if (error) - return error; - if (!stat) { - *icount = 0; - return error; - } + bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), + KM_SLEEP | KM_MAYFAIL); + if (!bc.buf) + return -ENOMEM; - /* Get the record, should always work */ - error = xfs_inobt_get_rec(cur, irec, &stat); - if (error) - return error; - XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); + error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc); - /* Check if the record contains the inode in request */ - if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { - *icount = 0; - return 0; - } + kmem_free(bc.buf); - idx = agino - irec->ir_startino + 1; - if (idx < XFS_INODES_PER_CHUNK && - (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) { - int i; - - /* We got a right chunk with some left inodes allocated at it. - * Grab the chunk record. Mark all the uninteresting inodes - * free -- because they're before our start point. 
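The (removed) partial-chunk logic here relied on a small bitmask trick: inodes below the resume index are simply marked free in the in-core record, so the downstream formatting loop skips them without any special casing. The trick in isolation:

	#include <stdint.h>
	#include <stdio.h>

	/* Build a mask of n consecutive bits starting at bit i (n <= 64). */
	static uint64_t maskn(unsigned int i, unsigned int n)
	{
		return n >= 64 ? ~0ULL << i : ((1ULL << n) - 1) << i;
	}

	int main(void)
	{
		uint64_t ir_free = 0xff00000000000000ULL;	/* top 8 free */
		unsigned int idx = 10;	/* resume point within the chunk */

		/*
		 * Pretend the first idx inodes are free so the walk skips
		 * them: the previous call already reported them.
		 */
		ir_free |= maskn(0, idx);
		printf("free mask: %#llx\n", (unsigned long long)ir_free);
		return 0;
	}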
- */ - for (i = 0; i < idx; i++) { - if (XFS_INOBT_MASK(i) & ~irec->ir_free) - irec->ir_freecount++; - } - - irec->ir_free |= xfs_inobt_maskn(0, idx); - *icount = irec->ir_count - irec->ir_freecount; - } + /* + * If we reported one inode to userspace then we abort because we hit + * the end of the buffer. Don't leak that back to userspace. + */ + if (error == XFS_IWALK_ABORT) + error = 0; - return 0; + return error; } -#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) - -struct xfs_bulkstat_agichunk { - char __user **ac_ubuffer;/* pointer into user's buffer */ - int ac_ubleft; /* bytes left in user's buffer */ - int ac_ubelem; /* spaces used in user's buffer */ -}; - -/* - * Process inodes in chunk with a pointer to a formatter function - * that will iget the inode and fill in the appropriate structure. - */ static int -xfs_bulkstat_ag_ichunk( - struct xfs_mount *mp, - xfs_agnumber_t agno, - struct xfs_inobt_rec_incore *irbp, - bulkstat_one_pf formatter, - size_t statstruct_size, - struct xfs_bulkstat_agichunk *acp, - xfs_agino_t *last_agino) +xfs_bulkstat_iwalk( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + void *data) { - char __user **ubufp = acp->ac_ubuffer; - int chunkidx; - int error = 0; - xfs_agino_t agino = irbp->ir_startino; - - for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; - chunkidx++, agino++) { - int fmterror; - int ubused; - - /* inode won't fit in buffer, we are done */ - if (acp->ac_ubleft < statstruct_size) - break; - - /* Skip if this inode is free */ - if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) - continue; - - /* Get the inode and fill in a single buffer */ - ubused = statstruct_size; - error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino), - *ubufp, acp->ac_ubleft, &ubused, &fmterror); - - if (fmterror == BULKSTAT_RV_GIVEUP || - (error && error != -ENOENT && error != -EINVAL)) { - acp->ac_ubleft = 0; - ASSERT(error); - break; - } - - /* be careful not to leak error if at end of chunk */ - if (fmterror == BULKSTAT_RV_NOTHING || error) { - error = 0; - continue; - } - - *ubufp += ubused; - acp->ac_ubleft -= ubused; - acp->ac_ubelem++; - } - - /* - * Post-update *last_agino. At this point, agino will always point one - * inode past the last inode we processed successfully. Hence we - * substract that inode when setting the *last_agino cursor so that we - * return the correct cookie to userspace. On the next bulkstat call, - * the inode under the lastino cookie will be skipped as we have already - * processed it here. - */ - *last_agino = agino - 1; + int error; + error = xfs_bulkstat_one_int(mp, tp, ino, data); + /* bulkstat just skips over missing inodes */ + if (error == -ENOENT || error == -EINVAL) + return 0; return error; } /* - * Return stat information in bulk (by-inode) for the filesystem. + * Check the incoming lastino parameter. + * + * We allow any inode value that could map to physical space inside the + * filesystem because if there are no inodes there, bulkstat moves on to the + * next chunk. In other words, the magic agino value of zero takes us to the + * first chunk in the AG, and an agino value past the end of the AG takes us to + * the first chunk in the next AG. + * + * Therefore we can end early if the requested inode is beyond the end of the + * filesystem or doesn't map properly. 
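The "maps properly" test works because an XFS inode number is a packed (agno, agino) pair: splitting it and recombining the pieces must reproduce the original value, and any stray bits above the valid range break the round trip. A simplified model of that check — the real split point is derived per-filesystem from sb_agblklog and sb_inopblog, so the widths below are purely illustrative:

	#include <stdint.h>
	#include <stdbool.h>

	#define AGINO_BITS	36u	/* illustrative; per-fs in real XFS */
	#define AGNO_BITS	8u	/* illustrative: at most 256 AGs here */

	static bool already_done(uint32_t agcount, uint64_t startino)
	{
		uint64_t agno = (startino >> AGINO_BITS) &
				((1ULL << AGNO_BITS) - 1);
		uint64_t agino = startino & ((1ULL << AGINO_BITS) - 1);
		uint64_t rejoined = (agno << AGINO_BITS) | agino;

		/*
		 * Bits that neither field can represent are lost in the
		 * split, so the rejoined value differs and the walk can end
		 * immediately; likewise if the AG is past the last one.
		 */
		return agno >= agcount || startino != rejoined;
	}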
*/ -int /* error status */ -xfs_bulkstat( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t *lastinop, /* last inode returned */ - int *ubcountp, /* size of buffer/count returned */ - bulkstat_one_pf formatter, /* func that'd fill a single buf */ - size_t statstruct_size, /* sizeof struct filling */ - char __user *ubuffer, /* buffer with inode stats */ - int *done) /* 1 if there are more stats to get */ +static inline bool +xfs_bulkstat_already_done( + struct xfs_mount *mp, + xfs_ino_t startino) { - xfs_buf_t *agbp; /* agi header buffer */ - xfs_agino_t agino; /* inode # in allocation group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ - int nirbuf; /* size of irbuf */ - int ubcount; /* size of user's buffer */ - struct xfs_bulkstat_agichunk ac; - int error = 0; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, startino); - /* - * Get the last inode value, see if there's nothing to do. - */ - agno = XFS_INO_TO_AGNO(mp, *lastinop); - agino = XFS_INO_TO_AGINO(mp, *lastinop); - if (agno >= mp->m_sb.sb_agcount || - *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) { - *done = 1; - *ubcountp = 0; - return 0; - } + return agno >= mp->m_sb.sb_agcount || + startino != XFS_AGINO_TO_INO(mp, agno, agino); +} - ubcount = *ubcountp; /* statstruct's */ - ac.ac_ubuffer = &ubuffer; - ac.ac_ubleft = ubcount * statstruct_size; /* bytes */; - ac.ac_ubelem = 0; +/* Return stat information in bulk (by-inode) for the filesystem. */ +int +xfs_bulkstat( + struct xfs_ibulk *breq, + bulkstat_one_fmt_pf formatter) +{ + struct xfs_bstat_chunk bc = { + .formatter = formatter, + .breq = breq, + }; + int error; - *ubcountp = 0; - *done = 0; + if (xfs_bulkstat_already_done(breq->mp, breq->startino)) + return 0; - irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP); - if (!irbuf) + bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), + KM_SLEEP | KM_MAYFAIL); + if (!bc.buf) return -ENOMEM; - nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf); - /* - * Loop over the allocation groups, starting from the last - * inode returned; 0 means start of the allocation group. - */ - while (agno < mp->m_sb.sb_agcount) { - struct xfs_inobt_rec_incore *irbp = irbuf; - struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; - bool end_of_ag = false; - int icount = 0; - int stat; - - error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - if (error) - break; - /* - * Allocate and initialize a btree cursor for ialloc btree. - */ - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, - XFS_BTNUM_INO); - if (agino > 0) { - /* - * In the middle of an allocation group, we need to get - * the remainder of the chunk we're in. - */ - struct xfs_inobt_rec_incore r; - - error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); - if (error) - goto del_cursor; - if (icount) { - irbp->ir_startino = r.ir_startino; - irbp->ir_holemask = r.ir_holemask; - irbp->ir_count = r.ir_count; - irbp->ir_freecount = r.ir_freecount; - irbp->ir_free = r.ir_free; - irbp++; - } - /* Increment to the next record */ - error = xfs_btree_increment(cur, 0, &stat); - } else { - /* Start of ag. Lookup the first inode chunk */ - error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); - } - if (error || stat == 0) { - end_of_ag = true; - goto del_cursor; - } - - /* - * Loop through inode btree records in this ag, - * until we run out of inodes or space in the buffer. 
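To make the validity check concrete (editor's worked example): on a filesystem with four AGs, a startino that decodes to agno 7 fails the sb_agcount comparison outright, while a startino carrying stray bits above the agino field changes value when decoded and re-encoded through XFS_AGINO_TO_INO(), so it fails the equality test. Either way the walk ends immediately with no output rather than chasing a cursor that cannot map to physical space.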
- */ - while (irbp < irbufend && icount < ubcount) { - struct xfs_inobt_rec_incore r; - - error = xfs_inobt_get_rec(cur, &r, &stat); - if (error || stat == 0) { - end_of_ag = true; - goto del_cursor; - } - - /* - * If this chunk has any allocated inodes, save it. - * Also start read-ahead now for this chunk. - */ - if (r.ir_freecount < r.ir_count) { - xfs_bulkstat_ichunk_ra(mp, agno, &r); - irbp->ir_startino = r.ir_startino; - irbp->ir_holemask = r.ir_holemask; - irbp->ir_count = r.ir_count; - irbp->ir_freecount = r.ir_freecount; - irbp->ir_free = r.ir_free; - irbp++; - icount += r.ir_count - r.ir_freecount; - } - error = xfs_btree_increment(cur, 0, &stat); - if (error || stat == 0) { - end_of_ag = true; - goto del_cursor; - } - cond_resched(); - } - - /* - * Drop the btree buffers and the agi buffer as we can't hold any - * of the locks these represent when calling iget. If there is a - * pending error, then we are done. - */ -del_cursor: - xfs_btree_del_cursor(cur, error); - xfs_buf_relse(agbp); - if (error) - break; - /* - * Now format all the good inodes into the user's buffer. The - * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer - * for the next loop iteration. - */ - irbufend = irbp; - for (irbp = irbuf; - irbp < irbufend && ac.ac_ubleft >= statstruct_size; - irbp++) { - error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, - formatter, statstruct_size, &ac, - &agino); - if (error) - break; - - cond_resched(); - } - - /* - * If we've run out of space or had a formatting error, we - * are now done - */ - if (ac.ac_ubleft < statstruct_size || error) - break; - - if (end_of_ag) { - agno++; - agino = 0; - } - } - /* - * Done, we're either out of filesystem or space to put the data. - */ - kmem_free(irbuf); - *ubcountp = ac.ac_ubelem; + error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags, + xfs_bulkstat_iwalk, breq->icount, &bc); + + kmem_free(bc.buf); /* * We found some inodes, so clear the error status and return them. @@ -503,135 +259,136 @@ del_cursor: * triggered again and propagated to userspace as there will be no * formatted inodes in the buffer. */ - if (ac.ac_ubelem) + if (breq->ocount > 0) error = 0; - /* - * If we ran out of filesystem, lastino will point off the end of - * the filesystem so the next call will return immediately. - */ - *lastinop = XFS_AGINO_TO_INO(mp, agno, agino); - if (agno >= mp->m_sb.sb_agcount) - *done = 1; - return error; } -int -xfs_inumbers_fmt( - void __user *ubuffer, /* buffer to write to */ - const struct xfs_inogrp *buffer, /* buffer to read from */ - long count, /* # of elements to read */ - long *written) /* # of bytes written */ +/* Convert bulkstat (v5) to bstat (v1). 
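Because struct xfs_ibulk now carries the iteration cursor, a caller can drive xfs_bulkstat() in batches without the old lastino/done out-parameters. The loop below is a hypothetical sketch, not code from this patch: startino is advanced internally past the last formatted inode, so simply calling again resumes the scan, and a batch that formats nothing means the walk reached the end of the filesystem.

static int
example_bulkstat_all(
	struct xfs_mount	*mp,
	void __user		*ubuf,
	unsigned int		batch,
	bulkstat_one_fmt_pf	fmt)
{
	struct xfs_ibulk	breq = {
		.mp		= mp,
		.ubuffer	= ubuf,
		.startino	= 0,
		.icount		= batch,
	};
	int			error;

	do {
		breq.ubuffer = ubuf;	/* reuse the same buffer */
		breq.ocount = 0;
		error = xfs_bulkstat(&breq, fmt);
		if (error)
			return error;
		/* ...consume breq.ocount records from ubuf here... */
	} while (breq.ocount > 0);
	return 0;
}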
*/ +void +xfs_bulkstat_to_bstat( + struct xfs_mount *mp, + struct xfs_bstat *bs1, + const struct xfs_bulkstat *bstat) { - if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) - return -EFAULT; - *written = count * sizeof(*buffer); - return 0; + memset(bs1, 0, sizeof(struct xfs_bstat)); + bs1->bs_ino = bstat->bs_ino; + bs1->bs_mode = bstat->bs_mode; + bs1->bs_nlink = bstat->bs_nlink; + bs1->bs_uid = bstat->bs_uid; + bs1->bs_gid = bstat->bs_gid; + bs1->bs_rdev = bstat->bs_rdev; + bs1->bs_blksize = bstat->bs_blksize; + bs1->bs_size = bstat->bs_size; + bs1->bs_atime.tv_sec = bstat->bs_atime; + bs1->bs_mtime.tv_sec = bstat->bs_mtime; + bs1->bs_ctime.tv_sec = bstat->bs_ctime; + bs1->bs_atime.tv_nsec = bstat->bs_atime_nsec; + bs1->bs_mtime.tv_nsec = bstat->bs_mtime_nsec; + bs1->bs_ctime.tv_nsec = bstat->bs_ctime_nsec; + bs1->bs_blocks = bstat->bs_blocks; + bs1->bs_xflags = bstat->bs_xflags; + bs1->bs_extsize = XFS_FSB_TO_B(mp, bstat->bs_extsize_blks); + bs1->bs_extents = bstat->bs_extents; + bs1->bs_gen = bstat->bs_gen; + bs1->bs_projid_lo = bstat->bs_projectid & 0xFFFF; + bs1->bs_forkoff = bstat->bs_forkoff; + bs1->bs_projid_hi = bstat->bs_projectid >> 16; + bs1->bs_sick = bstat->bs_sick; + bs1->bs_checked = bstat->bs_checked; + bs1->bs_cowextsize = XFS_FSB_TO_B(mp, bstat->bs_cowextsize_blks); + bs1->bs_dmevmask = 0; + bs1->bs_dmstate = 0; + bs1->bs_aextents = bstat->bs_aextents; +} + +struct xfs_inumbers_chunk { + inumbers_fmt_pf formatter; + struct xfs_ibulk *breq; +}; + +/* + * INUMBERS + * ======== + * This is how we export inode btree records to userspace, so that XFS tools + * can figure out where inodes are allocated. + */ + +/* + * Format the inode group structure and report it somewhere. + * + * Similar to xfs_bulkstat_one_int, lastino is the inode cursor as we walk + * through the filesystem so we move it forward unless there was a runtime + * error. If the formatter tells us the buffer is now full we also move the + * cursor forward and abort the walk. + */ +STATIC int +xfs_inumbers_walk( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *irec, + void *data) +{ + struct xfs_inumbers inogrp = { + .xi_startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino), + .xi_alloccount = irec->ir_count - irec->ir_freecount, + .xi_allocmask = ~irec->ir_free, + .xi_version = XFS_INUMBERS_VERSION_V5, + }; + struct xfs_inumbers_chunk *ic = data; + int error; + + error = ic->formatter(ic->breq, &inogrp); + if (error && error != XFS_IBULK_ABORT) + return error; + + ic->breq->startino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino) + + XFS_INODES_PER_CHUNK; + return error; } /* * Return inode number table for the filesystem. 
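The INUMBERS side uses the same cursor convention, so a v5 inumbers_fmt_pf can be just as small. Again an editor's sketch with a hypothetical name, built only from the interfaces this patch defines:

static int
example_inumbers_fmt(
	struct xfs_ibulk		*breq,
	const struct xfs_inumbers	*igrp)
{
	if (copy_to_user(breq->ubuffer, igrp, sizeof(*igrp)))
		return -EFAULT;
	return xfs_ibulk_advance(breq, sizeof(struct xfs_inumbers));
}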
*/ -int /* error status */ +int xfs_inumbers( - struct xfs_mount *mp,/* mount point for filesystem */ - xfs_ino_t *lastino,/* last inode returned */ - int *count,/* size of buffer/count returned */ - void __user *ubuffer,/* buffer with inode descriptions */ + struct xfs_ibulk *breq, inumbers_fmt_pf formatter) { - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino); - xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino); - struct xfs_btree_cur *cur = NULL; - struct xfs_buf *agbp = NULL; - struct xfs_inogrp *buffer; - int bcount; - int left = *count; - int bufidx = 0; + struct xfs_inumbers_chunk ic = { + .formatter = formatter, + .breq = breq, + }; int error = 0; - *count = 0; - if (agno >= mp->m_sb.sb_agcount || - *lastino != XFS_AGINO_TO_INO(mp, agno, agino)) - return error; + if (xfs_bulkstat_already_done(breq->mp, breq->startino)) + return 0; - bcount = min(left, (int)(PAGE_SIZE / sizeof(*buffer))); - buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP); - do { - struct xfs_inobt_rec_incore r; - int stat; - - if (!agbp) { - error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - if (error) - break; - - cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, - XFS_BTNUM_INO); - error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, - &stat); - if (error) - break; - if (!stat) - goto next_ag; - } - - error = xfs_inobt_get_rec(cur, &r, &stat); - if (error) - break; - if (!stat) - goto next_ag; - - agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; - buffer[bufidx].xi_startino = - XFS_AGINO_TO_INO(mp, agno, r.ir_startino); - buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; - buffer[bufidx].xi_allocmask = ~r.ir_free; - if (++bufidx == bcount) { - long written; - - error = formatter(ubuffer, buffer, bufidx, &written); - if (error) - break; - ubuffer += written; - *count += bufidx; - bufidx = 0; - } - if (!--left) - break; - - error = xfs_btree_increment(cur, 0, &stat); - if (error) - break; - if (stat) - continue; - -next_ag: - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - cur = NULL; - xfs_buf_relse(agbp); - agbp = NULL; - agino = 0; - agno++; - } while (agno < mp->m_sb.sb_agcount); - - if (!error) { - if (bufidx) { - long written; - - error = formatter(ubuffer, buffer, bufidx, &written); - if (!error) - *count += bufidx; - } - *lastino = XFS_AGINO_TO_INO(mp, agno, agino); - } + error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags, + xfs_inumbers_walk, breq->icount, &ic); - kmem_free(buffer); - if (cur) - xfs_btree_del_cursor(cur, error); - if (agbp) - xfs_buf_relse(agbp); + /* + * We found some inode groups, so clear the error status and return + * them. The lastino pointer will point directly at the inode that + * triggered any error that occurred, so on the next call the error + * will be triggered again and propagated to userspace as there will be + * no formatted inode groups in the buffer. + */ + if (breq->ocount > 0) + error = 0; return error; } + +/* Convert an inumbers (v5) struct to a inogrp (v1) struct. */ +void +xfs_inumbers_to_inogrp( + struct xfs_inogrp *ig1, + const struct xfs_inumbers *ig) +{ + ig1->xi_startino = ig->xi_startino; + ig1->xi_alloccount = ig->xi_alloccount; + ig1->xi_allocmask = ig->xi_allocmask; +} diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 8a822285b671..e90c1fc5b981 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -5,83 +5,55 @@ #ifndef __XFS_ITABLE_H__ #define __XFS_ITABLE_H__ -/* - * xfs_bulkstat() is used to fill in xfs_bstat structures as well as dm_stat - * structures (by the dmi library). 
This is a pointer to a formatter function - * that will iget the inode and fill in the appropriate structure. - * see xfs_bulkstat_one() and xfs_dm_bulkstat_one() in dmapi_xfs.c - */ -typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, - xfs_ino_t ino, - void __user *buffer, - int ubsize, - int *ubused, - int *stat); +/* In-memory representation of a userspace request for batch inode data. */ +struct xfs_ibulk { + struct xfs_mount *mp; + void __user *ubuffer; /* user output buffer */ + xfs_ino_t startino; /* start with this inode */ + unsigned int icount; /* number of elements in ubuffer */ + unsigned int ocount; /* number of records returned */ + unsigned int flags; /* see XFS_IBULK_FLAG_* */ +}; + +/* Only iterate within the same AG as startino */ +#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG) + +/* Return value that means we want to abort the walk. */ +#define XFS_IBULK_ABORT (XFS_IWALK_ABORT) /* - * Values for stat return value. + * Advance the user buffer pointer by one record of the given size. If the + * buffer is now full, return the appropriate error code. */ -#define BULKSTAT_RV_NOTHING 0 -#define BULKSTAT_RV_DIDONE 1 -#define BULKSTAT_RV_GIVEUP 2 +static inline int +xfs_ibulk_advance( + struct xfs_ibulk *breq, + size_t bytes) +{ + char __user *b = breq->ubuffer; + + breq->ubuffer = b + bytes; + breq->ocount++; + return breq->ocount == breq->icount ? XFS_IBULK_ABORT : 0; +} /* * Return stat information in bulk (by-inode) for the filesystem. */ -int /* error status */ -xfs_bulkstat( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t *lastino, /* last inode returned */ - int *count, /* size of buffer/count returned */ - bulkstat_one_pf formatter, /* func that'd fill a single buf */ - size_t statstruct_size,/* sizeof struct that we're filling */ - char __user *ubuffer,/* buffer with inode stats */ - int *done); /* 1 if there are more stats to get */ - -typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ - void __user *ubuffer, /* buffer to write to */ - int ubsize, /* remaining user buffer sz */ - int *ubused, /* bytes used by formatter */ - const xfs_bstat_t *buffer); /* buffer to read from */ - -int -xfs_bulkstat_one_int( - xfs_mount_t *mp, - xfs_ino_t ino, - void __user *buffer, - int ubsize, - bulkstat_one_fmt_pf formatter, - int *ubused, - int *stat); -int -xfs_bulkstat_one( - xfs_mount_t *mp, - xfs_ino_t ino, - void __user *buffer, - int ubsize, - int *ubused, - int *stat); +typedef int (*bulkstat_one_fmt_pf)(struct xfs_ibulk *breq, + const struct xfs_bulkstat *bstat); -typedef int (*inumbers_fmt_pf)( - void __user *ubuffer, /* buffer to write to */ - const xfs_inogrp_t *buffer, /* buffer to read from */ - long count, /* # of elements to read */ - long *written); /* # of bytes written */ +int xfs_bulkstat_one(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter); +int xfs_bulkstat(struct xfs_ibulk *breq, bulkstat_one_fmt_pf formatter); +void xfs_bulkstat_to_bstat(struct xfs_mount *mp, struct xfs_bstat *bs1, + const struct xfs_bulkstat *bstat); -int -xfs_inumbers_fmt( - void __user *ubuffer, /* buffer to write to */ - const xfs_inogrp_t *buffer, /* buffer to read from */ - long count, /* # of elements to read */ - long *written); /* # of bytes written */ +typedef int (*inumbers_fmt_pf)(struct xfs_ibulk *breq, + const struct xfs_inumbers *igrp); -int /* error status */ -xfs_inumbers( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t *last, /* last inode returned */ - int *count, /* size of buffer/count returned */ - void 
__user *buffer, /* buffer with inode info */ - inumbers_fmt_pf formatter); +int xfs_inumbers(struct xfs_ibulk *breq, inumbers_fmt_pf formatter); +void xfs_inumbers_to_inogrp(struct xfs_inogrp *ig1, + const struct xfs_inumbers *ig); #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c new file mode 100644 index 000000000000..8c7d727149ea --- /dev/null +++ b/fs/xfs/xfs_iwalk.c @@ -0,0 +1,720 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_iwalk.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_health.h" +#include "xfs_trans.h" +#include "xfs_pwork.h" + +/* + * Walking Inodes in the Filesystem + * ================================ + * + * This iterator function walks a subset of filesystem inodes in increasing + * order from @startino until there are no more inodes. For each allocated + * inode it finds, it calls a walk function with the relevant inode number and + * a pointer to caller-provided data. The walk function can return the usual + * negative error code to stop the iteration; 0 to continue the iteration; or + * XFS_IWALK_ABORT to stop the iteration. This return value is returned to the + * caller. + * + * Internally, we allow the walk function to do anything, which means that we + * cannot maintain the inobt cursor or our lock on the AGI buffer. We + * therefore cache the inobt records in kernel memory and only call the walk + * function when our memory buffer is full. @nr_recs is the number of records + * that we've cached, and @sz_recs is the size of our cache. + * + * It is the responsibility of the walk function to ensure it accesses + * allocated inodes, as the inobt records may be stale by the time they are + * acted upon. + */ + +struct xfs_iwalk_ag { + /* parallel work control data; will be null if single threaded */ + struct xfs_pwork pwork; + + struct xfs_mount *mp; + struct xfs_trans *tp; + + /* Where do we start the traversal? */ + xfs_ino_t startino; + + /* Array of inobt records we cache. */ + struct xfs_inobt_rec_incore *recs; + + /* Number of entries allocated for the @recs array. */ + unsigned int sz_recs; + + /* Number of entries in the @recs array that are in use. */ + unsigned int nr_recs; + + /* Inode walk function and data pointer. */ + xfs_iwalk_fn iwalk_fn; + xfs_inobt_walk_fn inobt_walk_fn; + void *data; + + /* + * Make it look like the inodes up to startino are free so that + * bulkstat can start its inode iteration at the correct place without + * needing to special case everywhere. + */ + unsigned int trim_start:1; + + /* Skip empty inobt records? */ + unsigned int skip_empty:1; +}; + +/* + * Loop over all clusters in a chunk for a given incore inode allocation btree + * record. Do a readahead if there are any allocated inodes in that cluster. 
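To make the iwalk contract concrete, here is a minimal xfs_iwalk_fn, sketched by the editor rather than taken from the patch. The walk function sees each allocated inode number in ascending order; returning XFS_IWALK_ABORT ends the walk early, and xfs_iwalk() hands that value back to its caller, which can then distinguish an early stop from a real error, exactly as xfs_bulkstat_one() does above.

struct example_icount {
	uint64_t	inodes;
	uint64_t	limit;
};

static int
example_count_inodes(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	struct example_icount	*ic = data;

	if (++ic->inodes >= ic->limit)
		return XFS_IWALK_ABORT;
	return 0;
}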
+ */ +STATIC void +xfs_iwalk_ichunk_ra( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irec) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t agbno; + struct blk_plug plug; + int i; /* inode chunk index */ + + agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino); + + blk_start_plug(&plug); + for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) { + xfs_inofree_t imask; + + imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); + if (imask & ~irec->ir_free) { + xfs_btree_reada_bufs(mp, agno, agbno, + igeo->blocks_per_cluster, + &xfs_inode_buf_ops); + } + agbno += igeo->blocks_per_cluster; + } + blk_finish_plug(&plug); +} + +/* + * Set the bits in @irec's free mask that correspond to the inodes before + * @agino so that we skip them. This is how we restart an inode walk that was + * interrupted in the middle of an inode record. + */ +STATIC void +xfs_iwalk_adjust_start( + xfs_agino_t agino, /* starting inode of chunk */ + struct xfs_inobt_rec_incore *irec) /* btree record */ +{ + int idx; /* index into inode chunk */ + int i; + + idx = agino - irec->ir_startino; + + /* + * We got a right chunk with some left inodes allocated at it. Grab + * the chunk record. Mark all the uninteresting inodes free because + * they're before our start point. + */ + for (i = 0; i < idx; i++) { + if (XFS_INOBT_MASK(i) & ~irec->ir_free) + irec->ir_freecount++; + } + + irec->ir_free |= xfs_inobt_maskn(0, idx); +} + +/* Allocate memory for a walk. */ +STATIC int +xfs_iwalk_alloc( + struct xfs_iwalk_ag *iwag) +{ + size_t size; + + ASSERT(iwag->recs == NULL); + iwag->nr_recs = 0; + + /* Allocate a prefetch buffer for inobt records. */ + size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); + iwag->recs = kmem_alloc(size, KM_MAYFAIL); + if (iwag->recs == NULL) + return -ENOMEM; + + return 0; +} + +/* Free memory we allocated for a walk. */ +STATIC void +xfs_iwalk_free( + struct xfs_iwalk_ag *iwag) +{ + kmem_free(iwag->recs); + iwag->recs = NULL; +} + +/* For each inuse inode in each cached inobt record, call our function. */ +STATIC int +xfs_iwalk_ag_recs( + struct xfs_iwalk_ag *iwag) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + xfs_ino_t ino; + unsigned int i, j; + xfs_agnumber_t agno; + int error; + + agno = XFS_INO_TO_AGNO(mp, iwag->startino); + for (i = 0; i < iwag->nr_recs; i++) { + struct xfs_inobt_rec_incore *irec = &iwag->recs[i]; + + trace_xfs_iwalk_ag_rec(mp, agno, irec); + + if (xfs_pwork_want_abort(&iwag->pwork)) + return 0; + + if (iwag->inobt_walk_fn) { + error = iwag->inobt_walk_fn(mp, tp, agno, irec, + iwag->data); + if (error) + return error; + } + + if (!iwag->iwalk_fn) + continue; + + for (j = 0; j < XFS_INODES_PER_CHUNK; j++) { + if (xfs_pwork_want_abort(&iwag->pwork)) + return 0; + + /* Skip if this inode is free */ + if (XFS_INOBT_MASK(j) & irec->ir_free) + continue; + + /* Otherwise call our function. */ + ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j); + error = iwag->iwalk_fn(mp, tp, ino, iwag->data); + if (error) + return error; + } + } + + return 0; +} + +/* Delete cursor and let go of AGI. */ +static inline void +xfs_iwalk_del_inobt( + struct xfs_trans *tp, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp, + int error) +{ + if (*curpp) { + xfs_btree_del_cursor(*curpp, error); + *curpp = NULL; + } + if (*agi_bpp) { + xfs_trans_brelse(tp, *agi_bpp); + *agi_bpp = NULL; + } +} + +/* + * Set ourselves up for walking inobt records starting from a given point in + * the filesystem. 
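A concrete illustration of xfs_iwalk_adjust_start(), with editor-chosen numbers: suppose a chunk starts at agino 128 and the walk resumes at agino 131. Then idx = 3 and xfs_inobt_maskn(0, 3) = 0x7, so OR-ing that into ir_free marks inodes 128-130 as free and they are never handed to the walk function again; ir_freecount is bumped for each of those that was actually allocated, which keeps the record's free accounting self-consistent.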
+ * + * If caller passed in a nonzero start inode number, load the record from the + * inobt and make the record look like all the inodes before agino are free so + * that we skip them, and then move the cursor to the next inobt record. This + * is how we support starting an iwalk in the middle of an inode chunk. + * + * If the caller passed in a start number of zero, move the cursor to the first + * inobt record. + * + * The caller is responsible for cleaning up the cursor and buffer pointer + * regardless of the error status. + */ +STATIC int +xfs_iwalk_ag_start( + struct xfs_iwalk_ag *iwag, + xfs_agnumber_t agno, + xfs_agino_t agino, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp, + int *has_more) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_inobt_rec_incore *irec; + int error; + + /* Set up a fresh cursor and empty the inobt cache. */ + iwag->nr_recs = 0; + error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + if (error) + return error; + + /* Starting at the beginning of the AG? That's easy! */ + if (agino == 0) + return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more); + + /* + * Otherwise, we have to grab the inobt record where we left off, stuff + * the record into our cache, and then see if there are more records. + * We require a lookup cache of at least two elements so that the + * caller doesn't have to deal with tearing down the cursor to walk the + * records. + */ + error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more); + if (error) + return error; + + /* + * If the LE lookup at @agino yields no records, jump ahead to the + * inobt cursor increment to see if there are more records to process. + */ + if (!*has_more) + goto out_advance; + + /* Get the record, should always work */ + irec = &iwag->recs[iwag->nr_recs]; + error = xfs_inobt_get_rec(*curpp, irec, has_more); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1); + + /* + * If the LE lookup yielded an inobt record before the cursor position, + * skip it and see if there's another one after it. + */ + if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) + goto out_advance; + + /* + * If agino fell in the middle of the inode record, make it look like + * the inodes up to agino are free so that we don't return them again. + */ + if (iwag->trim_start) + xfs_iwalk_adjust_start(agino, irec); + + /* + * The prefetch calculation is supposed to give us a large enough inobt + * record cache that grab_ichunk can stage a partial first record and + * the loop body can cache a record without having to check for cache + * space until after it reads an inobt record. + */ + iwag->nr_recs++; + ASSERT(iwag->nr_recs < iwag->sz_recs); + +out_advance: + return xfs_btree_increment(*curpp, 0, has_more); +} + +/* + * The inobt record cache is full, so preserve the inobt cursor state and + * run callbacks on the cached inobt records. When we're done, restore the + * cursor state to wherever the cursor would have been had the cache not been + * full (and therefore we could've just incremented the cursor) if *@has_more + * is true. On exit, *@has_more will indicate whether or not the caller should + * try for more inode records. 
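Worked example of the restart logic above (editor's illustration): resume at agino 200 when chunks start at aginos 128 and 256. The LE lookup lands on the chunk at 128; because 128 + XFS_INODES_PER_CHUNK <= 200, that chunk was already fully processed, so control jumps to out_advance and the cursor increment moves on to the chunk at 256. Had the walk resumed at agino 131 instead, the chunk at 128 would be kept, trimmed by xfs_iwalk_adjust_start(), and cached as the first record.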
+ */ +STATIC int +xfs_iwalk_run_callbacks( + struct xfs_iwalk_ag *iwag, + xfs_agnumber_t agno, + struct xfs_btree_cur **curpp, + struct xfs_buf **agi_bpp, + int *has_more) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_inobt_rec_incore *irec; + xfs_agino_t restart; + int error; + + ASSERT(iwag->nr_recs > 0); + + /* Delete cursor but remember the last record we cached... */ + xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + irec = &iwag->recs[iwag->nr_recs - 1]; + restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; + + error = xfs_iwalk_ag_recs(iwag); + if (error) + return error; + + /* ...empty the cache... */ + iwag->nr_recs = 0; + + if (!has_more) + return 0; + + /* ...and recreate the cursor just past where we left off. */ + error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp); + if (error) + return error; + + return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); +} + +/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ +STATIC int +xfs_iwalk_ag( + struct xfs_iwalk_ag *iwag) +{ + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_buf *agi_bp = NULL; + struct xfs_btree_cur *cur = NULL; + xfs_agnumber_t agno; + xfs_agino_t agino; + int has_more; + int error = 0; + + /* Set up our cursor at the right place in the inode btree. */ + agno = XFS_INO_TO_AGNO(mp, iwag->startino); + agino = XFS_INO_TO_AGINO(mp, iwag->startino); + error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more); + + while (!error && has_more) { + struct xfs_inobt_rec_incore *irec; + + cond_resched(); + if (xfs_pwork_want_abort(&iwag->pwork)) + goto out; + + /* Fetch the inobt record. */ + irec = &iwag->recs[iwag->nr_recs]; + error = xfs_inobt_get_rec(cur, irec, &has_more); + if (error || !has_more) + break; + + /* No allocated inodes in this chunk; skip it. */ + if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { + error = xfs_btree_increment(cur, 0, &has_more); + if (error) + break; + continue; + } + + /* + * Start readahead for this inode chunk in anticipation of + * walking the inodes. + */ + if (iwag->iwalk_fn) + xfs_iwalk_ichunk_ra(mp, agno, irec); + + /* + * If there's space in the buffer for more records, increment + * the btree cursor and grab more. + */ + if (++iwag->nr_recs < iwag->sz_recs) { + error = xfs_btree_increment(cur, 0, &has_more); + if (error || !has_more) + break; + continue; + } + + /* + * Otherwise, we need to save cursor state and run the callback + * function on the cached records. The run_callbacks function + * is supposed to return a cursor pointing to the record where + * we would be if we had been able to increment like above. + */ + ASSERT(has_more); + error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, + &has_more); + } + + if (iwag->nr_recs == 0 || error) + goto out; + + /* Walk the unprocessed records in the cache. */ + error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more); + +out: + xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); + return error; +} + +/* + * We experimentally determined that the reduction in ioctl call overhead + * diminishes when userspace asks for more than 2048 inodes, so we'll cap + * prefetch at this point. + */ +#define IWALK_MAX_INODE_PREFETCH (2048U) + +/* + * Given the number of inodes to prefetch, set the number of inobt records that + * we cache in memory, which controls the number of inodes we try to read + * ahead. Set the maximum if @inodes == 0. 
+ */ +static inline unsigned int +xfs_iwalk_prefetch( + unsigned int inodes) +{ + unsigned int inobt_records; + + /* + * If the caller didn't tell us the number of inodes they wanted, + * assume the maximum prefetch possible for best performance. + * Otherwise, cap prefetch at that maximum so that we don't start an + * absurd amount of prefetch. + */ + if (inodes == 0) + inodes = IWALK_MAX_INODE_PREFETCH; + inodes = min(inodes, IWALK_MAX_INODE_PREFETCH); + + /* Round the inode count up to a full chunk. */ + inodes = round_up(inodes, XFS_INODES_PER_CHUNK); + + /* + * In order to convert the number of inodes to prefetch into an + * estimate of the number of inobt records to cache, we require a + * conversion factor that reflects our expectations of the average + * loading factor of an inode chunk. Based on data gathered, most + * (but not all) filesystems manage to keep the inode chunks totally + * full, so we'll underestimate slightly so that our readahead will + * still deliver the performance we want on aging filesystems: + * + * inobt = inodes / (INODES_PER_CHUNK * (4 / 5)); + * + * The funny math is to avoid integer division. + */ + inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK); + + /* + * Allocate enough space to prefetch at least two inobt records so that + * we can cache both the record where the iwalk started and the next + * record. This simplifies the AG inode walk loop setup code. + */ + return max(inobt_records, 2U); +} + +/* + * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn + * will be called for each allocated inode, being passed the inode's number and + * @data. @max_prefetch controls how many inobt records' worth of inodes we + * try to readahead. + */ +int +xfs_iwalk( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t startino, + unsigned int flags, + xfs_iwalk_fn iwalk_fn, + unsigned int inode_records, + void *data) +{ + struct xfs_iwalk_ag iwag = { + .mp = mp, + .tp = tp, + .iwalk_fn = iwalk_fn, + .data = data, + .startino = startino, + .sz_recs = xfs_iwalk_prefetch(inode_records), + .trim_start = 1, + .skip_empty = 1, + .pwork = XFS_PWORK_SINGLE_THREADED, + }; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + int error; + + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); + + error = xfs_iwalk_alloc(&iwag); + if (error) + return error; + + for (; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_iwalk_ag(&iwag); + if (error) + break; + iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + if (flags & XFS_INOBT_WALK_SAME_AG) + break; + } + + xfs_iwalk_free(&iwag); + return error; +} + +/* Run per-thread iwalk work. */ +static int +xfs_iwalk_ag_work( + struct xfs_mount *mp, + struct xfs_pwork *pwork) +{ + struct xfs_iwalk_ag *iwag; + int error = 0; + + iwag = container_of(pwork, struct xfs_iwalk_ag, pwork); + if (xfs_pwork_want_abort(pwork)) + goto out; + + error = xfs_iwalk_alloc(iwag); + if (error) + goto out; + + error = xfs_iwalk_ag(iwag); + xfs_iwalk_free(iwag); +out: + kmem_free(iwag); + return error; +} + +/* + * Walk all the inodes in the filesystem using multiple threads to process each + * AG. 
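Plugging numbers into the prefetch heuristic (editor's illustration, with XFS_INODES_PER_CHUNK = 64): a request for 1000 inodes stays under the 2048 cap, rounds up to 1024, and yields 1024 * 5 / (4 * 64) = 20 inobt records. At the assumed 4/5 loading factor those 20 records hold about 20 * 64 * 4 / 5 = 1024 allocated inodes, matching the rounded request. A caller passing 0 gets the full 2048-inode prefetch, which works out to 40 cached records.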
+ */ +int +xfs_iwalk_threaded( + struct xfs_mount *mp, + xfs_ino_t startino, + unsigned int flags, + xfs_iwalk_fn iwalk_fn, + unsigned int inode_records, + bool polled, + void *data) +{ + struct xfs_pwork_ctl pctl; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + unsigned int nr_threads; + int error; + + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL)); + + nr_threads = xfs_pwork_guess_datadev_parallelism(mp); + error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk", + nr_threads); + if (error) + return error; + + for (; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_iwalk_ag *iwag; + + if (xfs_pwork_ctl_want_abort(&pctl)) + break; + + iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), KM_SLEEP); + iwag->mp = mp; + iwag->iwalk_fn = iwalk_fn; + iwag->data = data; + iwag->startino = startino; + iwag->sz_recs = xfs_iwalk_prefetch(inode_records); + xfs_pwork_queue(&pctl, &iwag->pwork); + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + if (flags & XFS_INOBT_WALK_SAME_AG) + break; + } + + if (polled) + xfs_pwork_poll(&pctl); + return xfs_pwork_destroy(&pctl); +} + +/* + * Allow callers to cache up to a page's worth of inobt records. This reflects + * the existing inumbers prefetching behavior. Since the inobt walk does not + * itself do anything with the inobt records, we can set a fairly high limit + * here. + */ +#define MAX_INOBT_WALK_PREFETCH \ + (PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore)) + +/* + * Given the number of records that the user wanted, set the number of inobt + * records that we buffer in memory. Set the maximum if @inobt_records == 0. + */ +static inline unsigned int +xfs_inobt_walk_prefetch( + unsigned int inobt_records) +{ + /* + * If the caller didn't tell us the number of inobt records they + * wanted, assume the maximum prefetch possible for best performance. + */ + if (inobt_records == 0) + inobt_records = MAX_INOBT_WALK_PREFETCH; + + /* + * Allocate enough space to prefetch at least two inobt records so that + * we can cache both the record where the iwalk started and the next + * record. This simplifies the AG inode walk loop setup code. + */ + inobt_records = max(inobt_records, 2U); + + /* + * Cap prefetch at that maximum so that we don't use an absurd amount + * of memory. + */ + return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH); +} + +/* + * Walk all inode btree records in the filesystem starting from @startino. The + * @inobt_walk_fn will be called for each btree record, being passed the incore + * record and @data. @max_prefetch controls how many inobt records we try to + * cache ahead of time. 
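A usage sketch for the threaded variant, reusing example_count_inodes from the earlier sketch (both hypothetical): one xfs_iwalk_ag work item is queued per AG, and polled mode makes the submitting thread wait for all of them to finish.

static uint64_t
example_count_all_inodes(
	struct xfs_mount	*mp)
{
	struct example_icount	ic = { .limit = ~0ULL };

	if (xfs_iwalk_threaded(mp, 0, 0, example_count_inodes, 0, true, &ic))
		return 0;
	return ic.inodes;
}

Note that with per-AG workers the callback can run on several threads at once, so a real counter would need an atomic type or per-AG totals; the plain increment above is only safe single-threaded.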
+ */ +int +xfs_inobt_walk( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t startino, + unsigned int flags, + xfs_inobt_walk_fn inobt_walk_fn, + unsigned int inobt_records, + void *data) +{ + struct xfs_iwalk_ag iwag = { + .mp = mp, + .tp = tp, + .inobt_walk_fn = inobt_walk_fn, + .data = data, + .startino = startino, + .sz_recs = xfs_inobt_walk_prefetch(inobt_records), + .pwork = XFS_PWORK_SINGLE_THREADED, + }; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + int error; + + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL)); + + error = xfs_iwalk_alloc(&iwag); + if (error) + return error; + + for (; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_iwalk_ag(&iwag); + if (error) + break; + iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + if (flags & XFS_INOBT_WALK_SAME_AG) + break; + } + + xfs_iwalk_free(&iwag); + return error; +} diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h new file mode 100644 index 000000000000..6c960e10ed4d --- /dev/null +++ b/fs/xfs/xfs_iwalk.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_IWALK_H__ +#define __XFS_IWALK_H__ + +/* Walk all inodes in the filesystem starting from @startino. */ +typedef int (*xfs_iwalk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_ino_t ino, void *data); +/* Return values for xfs_iwalk_fn. */ +#define XFS_IWALK_CONTINUE (XFS_ITER_CONTINUE) +#define XFS_IWALK_ABORT (XFS_ITER_ABORT) + +int xfs_iwalk(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t startino, + unsigned int flags, xfs_iwalk_fn iwalk_fn, + unsigned int inode_records, void *data); +int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino, + unsigned int flags, xfs_iwalk_fn iwalk_fn, + unsigned int inode_records, bool poll, void *data); + +/* Only iterate inodes within the same AG as @startino. */ +#define XFS_IWALK_SAME_AG (0x1) + +#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG) + +/* Walk all inode btree records in the filesystem starting from @startino. */ +typedef int (*xfs_inobt_walk_fn)(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *irec, + void *data); +/* Return value (for xfs_inobt_walk_fn) that aborts the walk immediately. */ +#define XFS_INOBT_WALK_ABORT (XFS_IWALK_ABORT) + +int xfs_inobt_walk(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_ino_t startino, unsigned int flags, + xfs_inobt_walk_fn inobt_walk_fn, unsigned int inobt_records, + void *data); + +/* Only iterate inobt records within the same AG as @startino. */ +#define XFS_INOBT_WALK_SAME_AG (XFS_IWALK_SAME_AG) + +#define XFS_INOBT_WALK_FLAGS_ALL (XFS_INOBT_WALK_SAME_AG) + +#endif /* __XFS_IWALK_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index edbd5a210df2..ca15105681ca 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -110,8 +110,6 @@ typedef __u32 xfs_nlink_t; #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) -#define spinlock_destroy(lock) - #define NBBY 8 /* number of bits per byte */ /* @@ -221,6 +219,9 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) return x; } +int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, + char *data, unsigned int op); + #define ASSERT_ALWAYS(expr) \ (likely(expr) ? 
(void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index c3b610b687d1..00e9f5c388d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -16,13 +16,10 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_log_recover.h" -#include "xfs_inode.h" #include "xfs_trace.h" -#include "xfs_fsops.h" -#include "xfs_cksum.h" #include "xfs_sysfs.h" #include "xfs_sb.h" +#include "xfs_health.h" kmem_zone_t *xfs_log_ticket_zone; @@ -44,21 +41,14 @@ STATIC int xlog_space_left( struct xlog *log, atomic64_t *head); -STATIC int -xlog_sync( - struct xlog *log, - struct xlog_in_core *iclog); STATIC void xlog_dealloc_log( struct xlog *log); /* local state machine functions */ -STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void -xlog_state_do_callback( - struct xlog *log, - int aborted, - struct xlog_in_core *iclog); +STATIC void xlog_state_done_syncing( + struct xlog_in_core *iclog, + bool aborted); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -106,8 +96,7 @@ STATIC void xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, - int count, - bool syncing); + int count); STATIC void xlog_verify_tail_lsn( struct xlog *log, @@ -116,7 +105,7 @@ xlog_verify_tail_lsn( #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) -#define xlog_verify_iclog(a,b,c,d) +#define xlog_verify_iclog(a,b,c) #define xlog_verify_tail_lsn(a,b,c) #endif @@ -540,32 +529,6 @@ xfs_log_done( return lsn; } -/* - * Attaches a new iclog I/O completion callback routine during - * transaction commit. If the log is in error state, a non-zero - * return code is handed back and the caller is responsible for - * executing the callback at an appropriate time. - */ -int -xfs_log_notify( - struct xlog_in_core *iclog, - xfs_log_callback_t *cb) -{ - int abortflg; - - spin_lock(&iclog->ic_callback_lock); - abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); - if (!abortflg) { - ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || - (iclog->ic_state == XLOG_STATE_WANT_SYNC)); - cb->cb_next = NULL; - *(iclog->ic_callback_tail) = cb; - iclog->ic_callback_tail = &(cb->cb_next); - } - spin_unlock(&iclog->ic_callback_lock); - return abortflg; -} - int xfs_log_release_iclog( struct xfs_mount *mp, @@ -806,16 +769,12 @@ xfs_log_mount_finish( * The mount has failed. Cancel the recovery if it hasn't completed and destroy * the log. */ -int +void xfs_log_mount_cancel( struct xfs_mount *mp) { - int error; - - error = xlog_recover_cancel(mp->m_log); + xlog_recover_cancel(mp->m_log); xfs_log_unmount(mp); - - return error; } /* @@ -861,7 +820,7 @@ xfs_log_write_unmount_record( * recalculated during log recovery at next mount. Refer to * xlog_check_unmount_rec for more details. */ - if (XFS_TEST_ERROR((mp->m_flags & XFS_MOUNT_BAD_SUMMARY), mp, + if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); @@ -931,7 +890,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) * Or, if we are doing a forced umount (typically because of IO errors). */ if (mp->m_flags & XFS_MOUNT_NORECOVERY || - xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + xfs_readonly_buftarg(log->l_targ)) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; } @@ -1243,53 +1202,49 @@ xlog_space_left( } -/* - * Log function which is called when an io completes. 
- * - * The log manager needs its own routine, in order to control what - * happens with the buffer after the write completes. - */ static void -xlog_iodone(xfs_buf_t *bp) +xlog_ioend_work( + struct work_struct *work) { - struct xlog_in_core *iclog = bp->b_log_item; - struct xlog *l = iclog->ic_log; - int aborted = 0; + struct xlog_in_core *iclog = + container_of(work, struct xlog_in_core, ic_end_io_work); + struct xlog *log = iclog->ic_log; + bool aborted = false; + int error; + + error = blk_status_to_errno(iclog->ic_bio.bi_status); +#ifdef DEBUG + /* treat writes with injected CRC errors as failed */ + if (iclog->ic_fail_crc) + error = -EIO; +#endif /* - * Race to shutdown the filesystem if we see an error or the iclog is in - * IOABORT state. The IOABORT state is only set in DEBUG mode to inject - * CRC errors into log recovery. + * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR) || - iclog->ic_state & XLOG_STATE_IOABORT) { - if (iclog->ic_state & XLOG_STATE_IOABORT) - iclog->ic_state &= ~XLOG_STATE_IOABORT; - - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_stale(bp); - xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); + if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + xfs_alert(log->l_mp, "log I/O error %d", error); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed * callback routines to let them know that the log-commit * didn't succeed. */ - aborted = XFS_LI_ABORTED; + aborted = true; } else if (iclog->ic_state & XLOG_STATE_IOERROR) { - aborted = XFS_LI_ABORTED; + aborted = true; } - /* log I/O is always issued ASYNC */ - ASSERT(bp->b_flags & XBF_ASYNC); xlog_state_done_syncing(iclog, aborted); + bio_uninit(&iclog->ic_bio); /* - * drop the buffer lock now that we are done. Nothing references - * the buffer after this, so an unmount waiting on this lock can now - * tear it down safely. As such, it is unsafe to reference the buffer - * (bp) after the unlock as we could race with it being freed. + * Drop the lock to signal that we are done. Nothing references the + * iclog after this, so an unmount waiting on this lock can now tear it + * down safely. As such, it is unsafe to reference the iclog after the + * unlock as we could race with it being freed. */ - xfs_buf_unlock(bp); + up(&iclog->ic_sema); } /* @@ -1300,65 +1255,26 @@ xlog_iodone(xfs_buf_t *bp) * If the filesystem blocksize is too large, we may need to choose a * larger size since the directory code currently logs entire blocks. */ - STATIC void xlog_get_iclog_buffer_size( struct xfs_mount *mp, struct xlog *log) { - int size; - int xhdrs; - if (mp->m_logbufs <= 0) - log->l_iclog_bufs = XLOG_MAX_ICLOGS; - else - log->l_iclog_bufs = mp->m_logbufs; + mp->m_logbufs = XLOG_MAX_ICLOGS; + if (mp->m_logbsize <= 0) + mp->m_logbsize = XLOG_BIG_RECORD_BSIZE; + + log->l_iclog_bufs = mp->m_logbufs; + log->l_iclog_size = mp->m_logbsize; /* - * Buffer size passed in from mount system call. + * # headers = size / 32k - one header holds cycles from 32k of data. 
*/ - if (mp->m_logbsize > 0) { - size = log->l_iclog_size = mp->m_logbsize; - log->l_iclog_size_log = 0; - while (size != 1) { - log->l_iclog_size_log++; - size >>= 1; - } - - if (xfs_sb_version_haslogv2(&mp->m_sb)) { - /* # headers = size / 32k - * one header holds cycles from 32k of data - */ - - xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; - if (mp->m_logbsize % XLOG_HEADER_CYCLE_SIZE) - xhdrs++; - log->l_iclog_hsize = xhdrs << BBSHIFT; - log->l_iclog_heads = xhdrs; - } else { - ASSERT(mp->m_logbsize <= XLOG_BIG_RECORD_BSIZE); - log->l_iclog_hsize = BBSIZE; - log->l_iclog_heads = 1; - } - goto done; - } - - /* All machines use 32kB buffers by default. */ - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - - /* the default log size is 16k or 32k which is one header sector */ - log->l_iclog_hsize = BBSIZE; - log->l_iclog_heads = 1; - -done: - /* are we being asked to make the sizes selected above visible? */ - if (mp->m_logbufs == 0) - mp->m_logbufs = log->l_iclog_bufs; - if (mp->m_logbsize == 0) - mp->m_logbsize = log->l_iclog_size; -} /* xlog_get_iclog_buffer_size */ - + log->l_iclog_heads = + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); + log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; +} void xfs_log_work_queue( @@ -1421,7 +1337,6 @@ xlog_alloc_log( xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; - xfs_buf_t *bp; int i; int error = -ENOMEM; uint log2_size = 0; @@ -1479,30 +1394,6 @@ xlog_alloc_log( xlog_get_iclog_buffer_size(mp, log); - /* - * Use a NULL block for the extra log buffer used during splits so that - * it will trigger errors if we ever try to do IO on it without first - * having set it up properly. - */ - error = -ENOMEM; - bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), XBF_NO_IOACCT); - if (!bp) - goto out_free_log; - - /* - * The iclogbuf buffer locks are held over IO but we are not going to do - * IO yet. Hence unlock the buffer so that the log IO path can grab it - * when appropriately. - */ - ASSERT(xfs_buf_islocked(bp)); - xfs_buf_unlock(bp); - - /* use high priority wq for log I/O completion */ - bp->b_ioend_wq = mp->m_log_workqueue; - bp->b_iodone = xlog_iodone; - log->l_xbuf = bp; - spin_lock_init(&log->l_icloglock); init_waitqueue_head(&log->l_flush_wait); @@ -1515,29 +1406,22 @@ xlog_alloc_log( * xlog_in_core_t in xfs_log_priv.h for details. 
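Worked example for the simplified sizing above (editor's numbers): with the default 32k log buffers, l_iclog_heads = DIV_ROUND_UP(32768, 32768) = 1 and l_iclog_hsize = 1 << BBSHIFT = 512 bytes, i.e. one header sector per iclog. A 256k buffer gets DIV_ROUND_UP(262144, 32768) = 8 header sectors (4096 bytes), since each 512-byte header only carries the cycle words for 32k of payload.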
*/ ASSERT(log->l_iclog_size >= 4096); - for (i=0; i < log->l_iclog_bufs; i++) { - *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); - if (!*iclogp) + for (i = 0; i < log->l_iclog_bufs; i++) { + size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * + sizeof(struct bio_vec); + + iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); + if (!iclog) goto out_free_iclog; - iclog = *iclogp; + *iclogp = iclog; iclog->ic_prev = prev_iclog; prev_iclog = iclog; - bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), - XBF_NO_IOACCT); - if (!bp) + iclog->ic_data = kmem_alloc_large(log->l_iclog_size, + KM_MAYFAIL); + if (!iclog->ic_data) goto out_free_iclog; - - ASSERT(xfs_buf_islocked(bp)); - xfs_buf_unlock(bp); - - /* use high priority wq for log I/O completion */ - bp->b_ioend_wq = mp->m_log_workqueue; - bp->b_iodone = xlog_iodone; - iclog->ic_bp = bp; - iclog->ic_data = bp->b_addr; #ifdef DEBUG log->l_iclog_bak[i] = &iclog->ic_header; #endif @@ -1551,36 +1435,43 @@ xlog_alloc_log( head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; + iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); spin_lock_init(&iclog->ic_callback_lock); - iclog->ic_callback_tail = &(iclog->ic_callback); + INIT_LIST_HEAD(&iclog->ic_callbacks); iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); + INIT_WORK(&iclog->ic_end_io_work, xlog_ioend_work); + sema_init(&iclog->ic_sema, 1); iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", + WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0, + mp->m_fsname); + if (!log->l_ioend_workqueue) + goto out_free_iclog; + error = xlog_cil_init(log); if (error) - goto out_free_iclog; + goto out_destroy_workqueue; return log; +out_destroy_workqueue: + destroy_workqueue(log->l_ioend_workqueue); out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - if (iclog->ic_bp) - xfs_buf_free(iclog->ic_bp); + kmem_free(iclog->ic_data); kmem_free(iclog); } - spinlock_destroy(&log->l_icloglock); - xfs_buf_free(log->l_xbuf); out_free_log: kmem_free(log); out: @@ -1765,42 +1656,155 @@ xlog_cksum( return xfs_end_cksum(crc); } -/* - * The bdstrat callback function for log bufs. This gives us a central - * place to trap bufs in case we get hit by a log I/O error and need to - * shutdown. Actually, in practice, even when we didn't get a log error, - * we transition the iclogs to IOERROR state *after* flushing all existing - * iclogs to disk. This is because we don't want anymore new transactions to be - * started or completed afterwards. - * - * We lock the iclogbufs here so that we can serialise against IO completion - * during unmount. We might be processing a shutdown triggered during unmount, - * and that can occur asynchronously to the unmount thread, and hence we need to - * ensure that completes before tearing down the iclogbufs. Hence we need to - * hold the buffer lock across the log IO to acheive that. 
 - */
-STATIC int
-xlog_bdstrat(
-	struct xfs_buf		*bp)
+static void
+xlog_bio_end_io(
+	struct bio		*bio)
 {
-	struct xlog_in_core	*iclog = bp->b_log_item;
+	struct xlog_in_core	*iclog = bio->bi_private;
 
-	xfs_buf_lock(bp);
-	if (iclog->ic_state & XLOG_STATE_IOERROR) {
-		xfs_buf_ioerror(bp, -EIO);
-		xfs_buf_stale(bp);
-		xfs_buf_ioend(bp);
+	queue_work(iclog->ic_log->l_ioend_workqueue,
+		   &iclog->ic_end_io_work);
+}
+
+static void
+xlog_map_iclog_data(
+	struct bio		*bio,
+	void			*data,
+	size_t			count)
+{
+	do {
+		struct page	*page = kmem_to_page(data);
+		unsigned int	off = offset_in_page(data);
+		size_t		len = min_t(size_t, count, PAGE_SIZE - off);
+
+		WARN_ON_ONCE(bio_add_page(bio, page, len, off) != len);
+
+		data += len;
+		count -= len;
+	} while (count);
+}
+
+STATIC void
+xlog_write_iclog(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	uint64_t		bno,
+	unsigned int		count,
+	bool			need_flush)
+{
+	ASSERT(bno < log->l_logBBsize);
+
+	/*
+	 * We lock the iclogbufs here so that we can serialise against I/O
+	 * completion during unmount. We might be processing a shutdown
+	 * triggered during unmount, and that can occur asynchronously to the
+	 * unmount thread, and hence we need to ensure that completes before
+	 * tearing down the iclogbufs. Hence we need to hold the buffer lock
+	 * across the log IO to achieve that.
+	 */
+	down(&iclog->ic_sema);
+	if (unlikely(iclog->ic_state & XLOG_STATE_IOERROR)) {
 		/*
 		 * It would seem logical to return EIO here, but we rely on
 		 * the log state machine to propagate I/O errors instead of
-		 * doing it here. Similarly, IO completion will unlock the
-		 * buffer, so we don't do it here.
+		 * doing it here. We kick off the state machine and unlock
+		 * the buffer manually; the code needs to be kept in sync
+		 * with the I/O completion path.
 		 */
-		return 0;
+		xlog_state_done_syncing(iclog, XFS_LI_ABORTED);
+		up(&iclog->ic_sema);
+		return;
 	}
 
-	xfs_buf_submit(bp);
-	return 0;
+	iclog->ic_io_size = count;
+
+	bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
+	bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
+	iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+	iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+	iclog->ic_bio.bi_private = iclog;
+	iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_FUA;
+	if (need_flush)
+		iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
+
+	xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, iclog->ic_io_size);
+	if (is_vmalloc_addr(iclog->ic_data))
+		flush_kernel_vmap_range(iclog->ic_data, iclog->ic_io_size);
+
+	/*
+	 * If this log buffer would straddle the end of the log we will have
+	 * to split it up into two bios, so that we can continue at the start.
+	 */
+	if (bno + BTOBB(count) > log->l_logBBsize) {
+		struct bio *split;
+
+		split = bio_split(&iclog->ic_bio, log->l_logBBsize - bno,
+				GFP_NOIO, &fs_bio_set);
+		bio_chain(split, &iclog->ic_bio);
+		submit_bio(split);
+
+		/* restart at logical offset zero for the remainder */
+		iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart;
+	}
+
+	submit_bio(&iclog->ic_bio);
+}
+
+/*
+ * We need to bump cycle number for the part of the iclog that is
+ * written to the start of the log. Watch out for the header magic
+ * number case, though.
+ */ +static void +xlog_split_iclog( + struct xlog *log, + void *data, + uint64_t bno, + unsigned int count) +{ + unsigned int split_offset = BBTOB(log->l_logBBsize - bno); + unsigned int i; + + for (i = split_offset; i < count; i += BBSIZE) { + uint32_t cycle = get_unaligned_be32(data + i); + + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + put_unaligned_be32(cycle, data + i); + } +} + +static int +xlog_calc_iclog_size( + struct xlog *log, + struct xlog_in_core *iclog, + uint32_t *roundoff) +{ + uint32_t count_init, count; + bool use_lsunit; + + use_lsunit = xfs_sb_version_haslogv2(&log->l_mp->m_sb) && + log->l_mp->m_sb.sb_logsunit > 1; + + /* Add for LR header */ + count_init = log->l_iclog_hsize + iclog->ic_offset; + + /* Round out the log write size */ + if (use_lsunit) { + /* we have a v2 stripe unit to use */ + count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); + } else { + count = BBTOB(BTOBB(count_init)); + } + + ASSERT(count >= count_init); + *roundoff = count - count_init; + + if (use_lsunit) + ASSERT(*roundoff < log->l_mp->m_sb.sb_logsunit); + else + ASSERT(*roundoff < BBTOB(1)); + return count; } /* @@ -1823,46 +1827,23 @@ xlog_bdstrat( * log will require grabbing the lock though. * * The entire log manager uses a logical block numbering scheme. Only - * log_sync (and then only bwrite()) know about the fact that the log may - * not start with block zero on a given device. The log block start offset - * is added immediately before calling bwrite(). + * xlog_write_iclog knows about the fact that the log may not start with + * block zero on a given device. */ - -STATIC int +STATIC void xlog_sync( struct xlog *log, struct xlog_in_core *iclog) { - xfs_buf_t *bp; - int i; - uint count; /* byte count of bwrite */ - uint count_init; /* initial count before roundup */ - int roundoff; /* roundoff to BB or stripe */ - int split = 0; /* split write into two regions */ - int error; - int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); - int size; + unsigned int count; /* byte count of bwrite */ + unsigned int roundoff; /* roundoff to BB or stripe */ + uint64_t bno; + unsigned int size; + bool need_flush = true, split = false; - XFS_STATS_INC(log->l_mp, xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); - /* Add for LR header */ - count_init = log->l_iclog_hsize + iclog->ic_offset; - - /* Round out the log write size */ - if (v2 && log->l_mp->m_sb.sb_logsunit > 1) { - /* we have a v2 stripe unit to use */ - count = XLOG_LSUNITTOB(log, XLOG_BTOLSUNIT(log, count_init)); - } else { - count = BBTOB(BTOBB(count_init)); - } - roundoff = count - count_init; - ASSERT(roundoff >= 0); - ASSERT((v2 && log->l_mp->m_sb.sb_logsunit > 1 && - roundoff < log->l_mp->m_sb.sb_logsunit) - || - (log->l_mp->m_sb.sb_logsunit <= 1 && - roundoff < BBTOB(1))); + count = xlog_calc_iclog_size(log, iclog, &roundoff); /* move grant heads by roundoff in sync */ xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); @@ -1873,41 +1854,19 @@ xlog_sync( /* real byte length */ size = iclog->ic_offset; - if (v2) + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) size += roundoff; iclog->ic_header.h_len = cpu_to_be32(size); - bp = iclog->ic_bp; - XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); - + XFS_STATS_INC(log->l_mp, xs_log_writes); XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); - /* Do we need to split this write into 2 parts? 
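Two concrete illustrations of the helpers above, with editor-chosen numbers. For xlog_split_iclog(): every 512-byte block in the wrapped tail has its leading cycle word bumped, so a block stamped with cycle 41 is rewritten as 42; if the increment would produce XLOG_HEADER_MAGIC_NUM itself, it is skipped, so recovery can never mistake a data block for a record header. For xlog_calc_iclog_size(): on a v2 log with a 32k stripe unit, a count_init of 10000 bytes rounds up to 32768, giving *roundoff = 22768, which the assertions require to stay below the stripe unit.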
*/ - if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { - char *dptr; - - split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); - count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); - /* - * Bump the cycle numbers at the start of each block in the - * part of the iclog that ends up in the buffer that gets - * written to the start of the log. - * - * Watch out for the header magic number case, though. - */ - dptr = (char *)&iclog->ic_header + count; - for (i = 0; i < split; i += BBSIZE) { - uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); - if (++cycle == XLOG_HEADER_MAGIC_NUM) - cycle++; - *(__be32 *)dptr = cpu_to_be32(cycle); - - dptr += BBSIZE; - } - } else { - iclog->ic_bwritecnt = 1; + /* Do we need to split this write into 2 parts? */ + if (bno + BTOBB(count) > log->l_logBBsize) { + xlog_split_iclog(log, &iclog->ic_header, bno, count); + split = true; } /* calculate the checksum */ @@ -1920,18 +1879,15 @@ * write on I/O completion and shutdown the fs. The subsequent mount * detects the bad CRC and attempts to recover. */ +#ifdef DEBUG if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); - iclog->ic_state |= XLOG_STATE_IOABORT; + iclog->ic_fail_crc = true; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", be64_to_cpu(iclog->ic_header.h_lsn)); } - - bp->b_io_length = BTOBB(count); - bp->b_log_item = iclog; - bp->b_flags &= ~XBF_FLUSH; - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); +#endif /* * Flush the data device before flushing the log to make sure all meta * data written back from the AIL actually made it to disk * @@ -1941,50 +1897,14 @@ * synchronously here; for an internal log we can simply use the block * layer state machine for preflushes. */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + if (log->l_targ != log->l_mp->m_ddev_targp || split) { xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - - ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); - ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - - xlog_verify_iclog(log, iclog, count, true); - - /* account for log which doesn't start at block #0 */ - XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); - - /* - * Don't call xfs_bwrite here. We do log-syncs even when the filesystem - * is shutting down.
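xlog_split_iclog(), added just above, rewrites the big-endian cycle word at the start of every 512-byte basic block that wraps past the physical end of the log, skipping over the value reserved for the record-header magic. A standalone demo of that transformation; BBSIZE and XLOG_HEADER_MAGIC_NUM carry their values from the XFS headers, and the byte-order helpers are hand-rolled so the program stands alone:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define BBSIZE 512
#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe	/* from xfs_log_format.h */

static uint32_t get_be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static void put_be32(unsigned char *p, uint32_t v)
{
	p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

/* Bump the cycle word of every basic block in [split_offset, count). */
static void split_bump_cycles(unsigned char *data, size_t split_offset,
			      size_t count)
{
	for (size_t i = split_offset; i < count; i += BBSIZE) {
		uint32_t cycle = get_be32(data + i);

		if (++cycle == XLOG_HEADER_MAGIC_NUM)
			cycle++;
		put_be32(data + i, cycle);
	}
}

int main(void)
{
	static unsigned char buf[4 * BBSIZE];

	for (int i = 0; i < 4; i++)
		put_be32(buf + i * BBSIZE, 7);	/* pretend cycle 7 */

	/* the last two blocks wrap to the start of the log */
	split_bump_cycles(buf, 2 * BBSIZE, sizeof(buf));
	for (int i = 0; i < 4; i++)
		printf("block %d cycle %u\n", i, get_be32(buf + i * BBSIZE));
	return 0;
}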
- */ - error = xlog_bdstrat(bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_sync"); - return error; + need_flush = false; } - if (split) { - bp = iclog->ic_log->l_xbuf; - XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ - xfs_buf_associate_memory(bp, - (char *)&iclog->ic_header + count, split); - bp->b_log_item = iclog; - bp->b_flags &= ~XBF_FLUSH; - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - - ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); - ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - - /* account for internal log which doesn't start at block #0 */ - XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); - error = xlog_bdstrat(bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); - return error; - } - } - return 0; -} /* xlog_sync */ + + xlog_verify_iclog(log, iclog, count); + xlog_write_iclog(log, iclog, bno, count, need_flush); +} /* * Deallocate a log structure @@ -2004,31 +1924,21 @@ xlog_dealloc_log( */ iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { - xfs_buf_lock(iclog->ic_bp); - xfs_buf_unlock(iclog->ic_bp); + down(&iclog->ic_sema); + up(&iclog->ic_sema); iclog = iclog->ic_next; } - /* - * Always need to ensure that the extra buffer does not point to memory - * owned by another log buffer before we free it. Also, cycle the lock - * first to ensure we've completed IO on it. - */ - xfs_buf_lock(log->l_xbuf); - xfs_buf_unlock(log->l_xbuf); - xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); - xfs_buf_free(log->l_xbuf); - iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { - xfs_buf_free(iclog->ic_bp); next_iclog = iclog->ic_next; + kmem_free(iclog->ic_data); kmem_free(iclog); iclog = next_iclog; } - spinlock_destroy(&log->l_icloglock); log->l_mp->m_log = NULL; + destroy_workqueue(log->l_ioend_workqueue); kmem_free(log); } /* xlog_dealloc_log */ @@ -2068,7 +1978,7 @@ xlog_print_tic_res( /* match with XLOG_REG_TYPE_* in xfs_log.h */ #define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str - static char *res_type_str[XLOG_REG_TYPE_MAX + 1] = { + static char *res_type_str[] = { REG_TYPE_STR(BFORMAT, "bformat"), REG_TYPE_STR(BCHUNK, "bchunk"), REG_TYPE_STR(EFI_FORMAT, "efi_format"), @@ -2088,8 +1998,15 @@ xlog_print_tic_res( REG_TYPE_STR(UNMOUNT, "unmount"), REG_TYPE_STR(COMMIT, "commit"), REG_TYPE_STR(TRANSHDR, "trans header"), - REG_TYPE_STR(ICREATE, "inode create") + REG_TYPE_STR(ICREATE, "inode create"), + REG_TYPE_STR(RUI_FORMAT, "rui_format"), + REG_TYPE_STR(RUD_FORMAT, "rud_format"), + REG_TYPE_STR(CUI_FORMAT, "cui_format"), + REG_TYPE_STR(CUD_FORMAT, "cud_format"), + REG_TYPE_STR(BUI_FORMAT, "bui_format"), + REG_TYPE_STR(BUD_FORMAT, "bud_format"), }; + BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1); #undef REG_TYPE_STR xfs_warn(mp, "ticket reservation summary:"); @@ -2602,7 +2519,7 @@ xlog_state_clean_log( if (iclog->ic_state == XLOG_STATE_DIRTY) { iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - ASSERT(iclog->ic_callback == NULL); + ASSERT(list_empty_careful(&iclog->ic_callbacks)); /* * If the number of ops in this iclog indicate it just * contains the dummy transaction, we can @@ -2672,37 +2589,32 @@ xlog_state_clean_log( STATIC xfs_lsn_t xlog_get_lowest_lsn( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *lsn_log; - xfs_lsn_t lowest_lsn, lsn; + struct xlog_in_core *iclog = log->l_iclog; + xfs_lsn_t lowest_lsn = 0, lsn; - lsn_log = log->l_iclog; - lowest_lsn = 0; do { - if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { - 
lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); - if ((lsn && !lowest_lsn) || - (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { + if (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY)) + continue; + + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) lowest_lsn = lsn; - } - } - lsn_log = lsn_log->ic_next; - } while (lsn_log != log->l_iclog); + } while ((iclog = iclog->ic_next) != log->l_iclog); + return lowest_lsn; } - STATIC void xlog_state_do_callback( struct xlog *log, - int aborted, + bool aborted, struct xlog_in_core *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've * processed all iclogs once */ - xfs_log_callback_t *cb, *cb_next; int flushcnt = 0; xfs_lsn_t lowest_lsn; int ioerrors; /* counter: iclogs with errors */ @@ -2813,7 +2725,7 @@ xlog_state_do_callback( */ ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); - if (iclog->ic_callback) + if (!list_empty_careful(&iclog->ic_callbacks)) atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(iclog->ic_header.h_lsn)); @@ -2830,26 +2742,20 @@ xlog_state_do_callback( * callbacks being added. */ spin_lock(&iclog->ic_callback_lock); - cb = iclog->ic_callback; - while (cb) { - iclog->ic_callback_tail = &(iclog->ic_callback); - iclog->ic_callback = NULL; - spin_unlock(&iclog->ic_callback_lock); + while (!list_empty(&iclog->ic_callbacks)) { + LIST_HEAD(tmp); - /* perform callbacks in the order given */ - for (; cb; cb = cb_next) { - cb_next = cb->cb_next; - cb->cb_func(cb->cb_arg, aborted); - } + list_splice_init(&iclog->ic_callbacks, &tmp); + + spin_unlock(&iclog->ic_callback_lock); + xlog_cil_process_committed(&tmp, aborted); spin_lock(&iclog->ic_callback_lock); - cb = iclog->ic_callback; } loopdidcallbacks++; funcdidcallbacks++; spin_lock(&log->l_icloglock); - ASSERT(iclog->ic_callback == NULL); spin_unlock(&iclog->ic_callback_lock); if (!(iclog->ic_state & XLOG_STATE_IOERROR)) iclog->ic_state = XLOG_STATE_DIRTY; @@ -2935,18 +2841,16 @@ xlog_state_do_callback( */ STATIC void xlog_state_done_syncing( - xlog_in_core_t *iclog, - int aborted) + struct xlog_in_core *iclog, + bool aborted) { - struct xlog *log = iclog->ic_log; + struct xlog *log = iclog->ic_log; spin_lock(&log->l_icloglock); ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_IOERROR); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); - ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); - /* * If we got an error, either on the first buffer, or in the case of @@ -2954,13 +2858,8 @@ xlog_state_done_syncing( * and none should ever be attempted to be written to disk * again. */ - if (iclog->ic_state != XLOG_STATE_IOERROR) { - if (--iclog->ic_bwritecnt == 1) { - spin_unlock(&log->l_icloglock); - return; - } + if (iclog->ic_state != XLOG_STATE_IOERROR) iclog->ic_state = XLOG_STATE_DONE_SYNC; - } /* * Someone could be sleeping prior to writing out the next @@ -3229,7 +3128,7 @@ xlog_state_release_iclog( * flags after this point. 
*/ if (sync) - return xlog_sync(log, iclog); + xlog_sync(log, iclog); return 0; } /* xlog_state_release_iclog */ @@ -3820,8 +3719,7 @@ STATIC void xlog_verify_iclog( struct xlog *log, struct xlog_in_core *iclog, - int count, - bool syncing) + int count) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3865,7 +3763,7 @@ xlog_verify_iclog( /* clientid is only 1 byte */ p = &ophead->oh_clientid; field_offset = p - base_ptr; - if (!syncing || (field_offset & 0x1ff)) { + if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); @@ -3888,7 +3786,7 @@ xlog_verify_iclog( /* check length */ p = &ophead->oh_len; field_offset = p - base_ptr; - if (!syncing || (field_offset & 0x1ff)) { + if (field_offset & 0x1ff) { op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((uintptr_t)&ophead->oh_len - @@ -4025,7 +3923,7 @@ xfs_log_force_umount( * avoid races. */ wake_up_all(&log->l_cilp->xc_commit_wait); - xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); + xlog_state_do_callback(log, true, NULL); #ifdef XFSERRORDEBUG { diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 73a64bf32f6f..84e06805160f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -6,6 +6,8 @@ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ +struct xfs_cil_ctx; + struct xfs_log_vec { struct xfs_log_vec *lv_next; /* next lv in build list */ int lv_niovecs; /* number of iovecs in lv */ @@ -72,16 +74,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, } /* - * Structure used to pass callback function and the function's argument - * to the log manager. - */ -typedef struct xfs_log_callback { - struct xfs_log_callback *cb_next; - void (*cb_func)(void *, int); - void *cb_arg; -} xfs_log_callback_t; - -/* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number */ @@ -125,12 +117,10 @@ int xfs_log_mount(struct xfs_mount *mp, xfs_daddr_t start_block, int num_bblocks); int xfs_log_mount_finish(struct xfs_mount *mp); -int xfs_log_mount_cancel(struct xfs_mount *); +void xfs_log_mount_cancel(struct xfs_mount *); xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); void xfs_log_space_wake(struct xfs_mount *mp); -int xfs_log_notify(struct xlog_in_core *iclog, - struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, @@ -148,6 +138,7 @@ void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, bool regrant); +void xlog_cil_process_committed(struct list_head *list, bool aborted); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index d3884e08b43c..fa5602d0fd7f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -10,10 +10,7 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_alloc.h" #include "xfs_extent_busy.h" -#include "xfs_discard.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" @@ -246,7 +243,8 @@ xfs_cil_prepare_item( * shadow buffer, so update the pointer to it appropriately.
*/ if (!old_lv) { - lv->lv_item->li_ops->iop_pin(lv->lv_item); + if (lv->lv_item->li_ops->iop_pin) + lv->lv_item->li_ops->iop_pin(lv->lv_item); lv->lv_item->li_lv_shadow = NULL; } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); @@ -576,12 +574,24 @@ xlog_discard_busy_extents( */ static void xlog_cil_committed( - void *args, - int abort) + struct xfs_cil_ctx *ctx, + bool abort) { - struct xfs_cil_ctx *ctx = args; struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + /* + * If the I/O failed, we're aborting the commit and already shutdown. + * Wake any commit waiters before aborting the log items so we don't + * block async log pushers on callbacks. Async log pushers explicitly do + * not wait on log force completion because they may be holding locks + * required to unpin items. + */ + if (abort) { + spin_lock(&ctx->cil->xc_push_lock); + wake_up_all(&ctx->cil->xc_commit_wait); + spin_unlock(&ctx->cil->xc_push_lock); + } + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); @@ -589,15 +599,7 @@ xlog_cil_committed( xfs_extent_busy_clear(mp, &ctx->busy_extents, (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); - /* - * If we are aborting the commit, wake up anyone waiting on the - * committing list. If we don't, then a shutdown we can leave processes - * waiting in xlog_cil_force_lsn() waiting on a sequence commit that - * will never happen because we aborted it. - */ spin_lock(&ctx->cil->xc_push_lock); - if (abort) - wake_up_all(&ctx->cil->xc_commit_wait); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); @@ -609,6 +611,20 @@ xlog_cil_committed( kmem_free(ctx); } +void +xlog_cil_process_committed( + struct list_head *list, + bool aborted) +{ + struct xfs_cil_ctx *ctx; + + while ((ctx = list_first_entry_or_null(list, + struct xfs_cil_ctx, iclog_entry))) { + list_del(&ctx->iclog_entry); + xlog_cil_committed(ctx, aborted); + } +} + /* * Push the Committed Item List to the log. If @push_seq flag is zero, then it * is a background flush and so we can chose to ignore it. Otherwise, if the @@ -830,12 +846,15 @@ restart: if (commit_lsn == -1) goto out_abort; - /* attach all the transactions w/ busy extents to iclog */ - ctx->log_cb.cb_func = xlog_cil_committed; - ctx->log_cb.cb_arg = ctx; - error = xfs_log_notify(commit_iclog, &ctx->log_cb); - if (error) + spin_lock(&commit_iclog->ic_callback_lock); + if (commit_iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&commit_iclog->ic_callback_lock); goto out_abort; + } + ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE || + commit_iclog->ic_state == XLOG_STATE_WANT_SYNC); + list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks); + spin_unlock(&commit_iclog->ic_callback_lock); /* * now the checkpoint commit is complete and we've attached the @@ -859,7 +878,7 @@ out_skip: out_abort_free_ticket: xfs_log_ticket_put(tic); out_abort: - xlog_cil_committed(ctx, XFS_LI_ABORTED); + xlog_cil_committed(ctx, true); return -EIO; } @@ -979,6 +998,7 @@ xfs_log_commit_cil( { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + struct xfs_log_item *lip, *next; xfs_lsn_t xc_commit_lsn; /* @@ -1003,7 +1023,7 @@ xfs_log_commit_cil( /* * Once all the items of the transaction have been copied to the CIL, - * the items can be unlocked and freed. + * the items can be unlocked and possibly freed. 
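xlog_cil_committed() is now reached via xlog_cil_process_committed(), which walks a list of contexts that the I/O completion path detached from the iclog while holding ic_callback_lock. The detach-under-lock, run-outside-lock shape of xlog_state_do_callback() can be sketched in userspace with a pthread mutex and a hand-rolled singly linked list standing in for the kernel primitives:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	struct ctx *next;
	int id;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct ctx *pending;		/* stands in for ic_callbacks */

static void process(struct ctx *c, int aborted)
{
	printf("ctx %d completed%s\n", c->id, aborted ? " (aborted)" : "");
	free(c);
}

/* Detach the whole list under the lock, run the callbacks outside it. */
static void process_committed(int aborted)
{
	pthread_mutex_lock(&lock);
	while (pending) {
		struct ctx *tmp = pending;

		pending = NULL;
		pthread_mutex_unlock(&lock);

		while (tmp) {
			struct ctx *next = tmp->next;

			process(tmp, aborted);
			tmp = next;
		}
		/* retake the lock and recheck for late arrivals */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct ctx *c = malloc(sizeof(*c));

		if (!c)
			return 1;
		c->id = i;
		c->next = pending;
		pending = c;
	}
	process_committed(0);
	return 0;
}

Dropping the lock before running the callbacks matters because a callback may itself queue more work; the outer loop rechecks the list afterwards, just as the kernel code retakes ic_callback_lock.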
* * This needs to be done before we drop the CIL context lock because we * have to update state in the log items and unlock them before they go @@ -1012,8 +1032,12 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, xc_commit_lsn, false); - + trace_xfs_trans_commit_items(tp, _RET_IP_); + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + xfs_trans_del_item(lip); + if (lip->li_ops->iop_committing) + lip->li_ops->iop_committing(lip, xc_commit_lsn); + } xlog_cil_push_background(log); up_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index b5f82cb36202..b880c23cb6e4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -10,7 +10,6 @@ struct xfs_buf; struct xlog; struct xlog_ticket; struct xfs_mount; -struct xfs_log_callback; /* * Flags for log structure @@ -50,7 +49,6 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ #define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ -#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ @@ -179,11 +177,10 @@ typedef struct xlog_ticket { * the iclog. * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. - * - ic_bp is a pointer to the buffer used to write this incore log to disk. * - ic_log is a pointer back to the global log structure. - * - ic_callback is a linked list of callback function/argument pairs to be - * called after an iclog finishes writing. - * - ic_size is the full size of the header plus data. + * - ic_size is the full size of the log buffer, minus the cycle headers. + * - ic_io_size is the size of the currently pending log buffer write, which + * might be smaller than ic_size * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. @@ -193,7 +190,7 @@ typedef struct xlog_ticket { * structure cacheline aligned. 
The following fields can be contended on * by independent processes: * - * - ic_callback_* + * - ic_callbacks * - ic_refcnt * - fields protected by the global l_icloglock * @@ -206,23 +203,28 @@ typedef struct xlog_in_core { wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; - struct xfs_buf *ic_bp; struct xlog *ic_log; - int ic_size; - int ic_offset; - int ic_bwritecnt; + u32 ic_size; + u32 ic_io_size; + u32 ic_offset; unsigned short ic_state; char *ic_datap; /* pointer to iclog data */ /* Callback structures need their own cacheline */ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; - struct xfs_log_callback *ic_callback; - struct xfs_log_callback **ic_callback_tail; + struct list_head ic_callbacks; /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; xlog_in_core_2_t *ic_data; #define ic_header ic_data->hic_header +#ifdef DEBUG + bool ic_fail_crc : 1; +#endif + struct semaphore ic_sema; + struct work_struct ic_end_io_work; + struct bio ic_bio; + struct bio_vec ic_bvec[]; } xlog_in_core_t; /* @@ -243,7 +245,7 @@ struct xfs_cil_ctx { int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ - struct xfs_log_callback log_cb; /* completion callback hook. */ + struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; }; @@ -350,9 +352,8 @@ struct xlog { struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ struct xfs_cil *l_cilp; /* CIL log is working with */ - struct xfs_buf *l_xbuf; /* extra buffer for log - * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */ struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ @@ -361,7 +362,6 @@ struct xlog { int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ - int l_iclog_size_log; /* log power size of log */ int l_iclog_bufs; /* number of iclog buffers */ xfs_daddr_t l_logBBstart; /* start block of log */ int l_logsize; /* size of log in bytes */ @@ -418,7 +418,7 @@ xlog_recover( extern int xlog_recover_finish( struct xlog *log); -extern int +extern void xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9fe88d125f0a..13d1d3e95b88 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -13,8 +13,6 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_log.h" @@ -26,7 +24,6 @@ #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_quota.h" -#include "xfs_cksum.h" #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" @@ -79,7 +76,7 @@ struct xfs_buf_cancel { * are valid, false otherwise. */ static inline bool -xlog_verify_bp( +xlog_verify_bno( struct xlog *log, xfs_daddr_t blk_no, int bbcount) @@ -92,22 +89,19 @@ xlog_verify_bp( } /* - * Allocate a buffer to hold log data. The buffer needs to be able - * to map to a range of nbblks basic blocks at any valid (basic - * block) offset within the log. 
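The reworked xlog_in_core above ends in a flexible array member, ic_bvec[], so the iclog and however many bio_vecs its buffer needs come from one allocation whose size is fixed when the log is set up. A sketch of that layout, with invented names (vec, iclog_alloc) rather than the kernel types:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096
#define howmany(x, y) (((x) + ((y) - 1)) / (y))

struct vec {
	void *base;
	size_t len;
};

/* Modeled on xlog_in_core: fixed fields plus a flexible vector array. */
struct iclog {
	unsigned int io_size;
	unsigned int nvecs;
	struct vec bvec[];		/* flexible array member */
};

static struct iclog *iclog_alloc(unsigned int io_size)
{
	unsigned int nvecs = howmany(io_size, PAGE_SIZE);
	struct iclog *ic = malloc(sizeof(*ic) + nvecs * sizeof(ic->bvec[0]));

	if (ic) {
		ic->io_size = io_size;
		ic->nvecs = nvecs;
	}
	return ic;
}

int main(void)
{
	struct iclog *ic = iclog_alloc(3 * PAGE_SIZE + 100);

	if (ic)
		printf("allocated iclog with %u vectors\n", ic->nvecs);
	free(ic);
	return 0;
}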
+ * Allocate a buffer to hold log data. The buffer needs to be able to map to + * a range of nbblks basic blocks at any valid offset within the log. */ -STATIC xfs_buf_t * -xlog_get_bp( +static char * +xlog_alloc_buffer( struct xlog *log, int nbblks) { - struct xfs_buf *bp; - /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. */ - if (!xlog_verify_bp(log, 0, nbblks)) { + if (!xlog_verify_bno(log, 0, nbblks)) { xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); @@ -115,69 +109,48 @@ xlog_get_bp( } /* - * We do log I/O in units of log sectors (a power-of-2 - * multiple of the basic block size), so we round up the - * requested size to accommodate the basic blocks required - * for complete log sectors. + * We do log I/O in units of log sectors (a power-of-2 multiple of the + * basic block size), so we round up the requested size to accommodate + * the basic blocks required for complete log sectors. * - * In addition, the buffer may be used for a non-sector- - * aligned block offset, in which case an I/O of the - * requested size could extend beyond the end of the - * buffer. If the requested size is only 1 basic block it - * will never straddle a sector boundary, so this won't be - * an issue. Nor will this be a problem if the log I/O is - * done in basic blocks (sector size 1). But otherwise we - * extend the buffer by one extra log sector to ensure - * there's space to accommodate this possibility. + * In addition, the buffer may be used for a non-sector-aligned block + * offset, in which case an I/O of the requested size could extend + * beyond the end of the buffer. If the requested size is only 1 basic + * block it will never straddle a sector boundary, so this won't be an + * issue. Nor will this be a problem if the log I/O is done in basic + * blocks (sector size 1). But otherwise we extend the buffer by one + * extra log sector to ensure there's space to accommodate this + * possibility. */ if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - - bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); - if (bp) - xfs_buf_unlock(bp); - return bp; -} - -STATIC void -xlog_put_bp( - xfs_buf_t *bp) -{ - xfs_buf_free(bp); + return kmem_alloc_large(BBTOB(nbblks), KM_MAYFAIL); } /* * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. */ -STATIC char * +static inline unsigned int xlog_align( struct xlog *log, - xfs_daddr_t blk_no, - int nbblks, - struct xfs_buf *bp) + xfs_daddr_t blk_no) { - xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - - ASSERT(offset + nbblks <= bp->b_length); - return bp->b_addr + BBTOB(offset); + return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1)); } - -/* - * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 
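xlog_alloc_buffer() keeps the old sizing rules: round the block count up to whole log sectors, and for multi-block requests on multi-block sectors add one spare sector so a non-sector-aligned start cannot run past the end; xlog_align() then locates a block's bytes inside the rounded buffer. The arithmetic in a small self-contained program (the 4k-sector value in main() is only an example):

#include <stdio.h>

#define BBSHIFT 9
#define BBTOB(bbs) ((bbs) << BBSHIFT)	/* basic blocks to bytes */

/* Round a basic-block count up to whole sectors, plus one spare. */
static int alloc_size_bblocks(int nbblks, int sectbb)
{
	if (nbblks > 1 && sectbb > 1)
		nbblks += sectbb;		/* room for unaligned start */
	return (nbblks + sectbb - 1) & ~(sectbb - 1);
}

/* Byte offset of blk_no's data within a sector-aligned buffer. */
static unsigned int align_offset(long long blk_no, int sectbb)
{
	return BBTOB(blk_no & (sectbb - 1));
}

int main(void)
{
	int sectbb = 8;		/* 4k log sectors, as an example */

	printf("allocate %d bblocks for a 5-block read\n",
	       alloc_size_bblocks(5, sectbb));
	printf("block 13 starts %u bytes into its buffer\n",
	       align_offset(13, sectbb));
	return 0;
}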
- */ -STATIC int -xlog_bread_noalign( - struct xlog *log, - xfs_daddr_t blk_no, - int nbblks, - struct xfs_buf *bp) +static int +xlog_do_io( + struct xlog *log, + xfs_daddr_t blk_no, + unsigned int nbblks, + char *data, + unsigned int op) { - int error; + int error; - if (!xlog_verify_bp(log, blk_no, nbblks)) { + if (!xlog_verify_bno(log, blk_no, nbblks)) { xfs_warn(log->l_mp, "Invalid log block/length (0x%llx, 0x%x) for buffer", blk_no, nbblks); @@ -187,107 +160,53 @@ xlog_bread_noalign( blk_no = round_down(blk_no, log->l_sectBBsize); nbblks = round_up(nbblks, log->l_sectBBsize); - ASSERT(nbblks > 0); - ASSERT(nbblks <= bp->b_length); - - XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); - bp->b_flags |= XBF_READ; - bp->b_io_length = nbblks; - bp->b_error = 0; - error = xfs_buf_submit(bp); - if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) - xfs_buf_ioerror_alert(bp, __func__); + error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no, + BBTOB(nbblks), data, op); + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_alert(log->l_mp, + "log recovery %s I/O error at daddr 0x%llx len %d error %d", + op == REQ_OP_WRITE ? "write" : "read", + blk_no, nbblks, error); + } return error; } STATIC int -xlog_bread( +xlog_bread_noalign( struct xlog *log, xfs_daddr_t blk_no, int nbblks, - struct xfs_buf *bp, - char **offset) + char *data) { - int error; - - error = xlog_bread_noalign(log, blk_no, nbblks, bp); - if (error) - return error; - - *offset = xlog_align(log, blk_no, nbblks, bp); - return 0; + return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ); } -/* - * Read at an offset into the buffer. Returns with the buffer in it's original - * state regardless of the result of the read. - */ STATIC int -xlog_bread_offset( +xlog_bread( struct xlog *log, - xfs_daddr_t blk_no, /* block to read from */ - int nbblks, /* blocks to read */ - struct xfs_buf *bp, - char *offset) + xfs_daddr_t blk_no, + int nbblks, + char *data, + char **offset) { - char *orig_offset = bp->b_addr; - int orig_len = BBTOB(bp->b_length); - int error, error2; - - error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); - if (error) - return error; - - error = xlog_bread_noalign(log, blk_no, nbblks, bp); + int error; - /* must reset buffer pointer even on error */ - error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); - if (error) - return error; - return error2; + error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ); + if (!error) + *offset = data + xlog_align(log, blk_no); + return error; } -/* - * Write out the buffer at the given block for the given number of blocks. - * The buffer is kept locked across the write and is returned locked. - * This can only be used for synchronous log writes. 
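xlog_bread() and xlog_bwrite() are now thin wrappers around one xlog_do_io() that validates the range, rounds both ends out to sector boundaries, and issues the transfer. A userspace approximation using pread()/pwrite() on an ordinary file where the kernel uses xfs_rw_bdev(); the file name and geometry in main() are made up for the demo:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

#define BBSHIFT 9
#define BBTOB(bbs) ((size_t)(bbs) << BBSHIFT)

/* One helper for both directions, as in the new xlog_do_io(). */
static int log_do_io(int fd, long long blk_no, int nbblks, char *data,
		     int is_write, int sectbb, long long log_bblocks)
{
	ssize_t ret;

	if (blk_no < 0 || blk_no >= log_bblocks ||
	    nbblks <= 0 || blk_no + nbblks > log_bblocks) {
		fprintf(stderr, "invalid log range (0x%llx, 0x%x)\n",
			blk_no, nbblks);
		return EINVAL;
	}

	blk_no &= ~((long long)sectbb - 1);		/* round down */
	nbblks = (nbblks + sectbb - 1) & ~(sectbb - 1);	/* round up */

	ret = is_write ?
		pwrite(fd, data, BBTOB(nbblks), BBTOB(blk_no)) :
		pread(fd, data, BBTOB(nbblks), BBTOB(blk_no));
	return ret == (ssize_t)BBTOB(nbblks) ? 0 : EIO;
}

int main(void)
{
	int fd = open("logfile.img", O_RDWR | O_CREAT, 0644);
	char *buf = calloc(1, BBTOB(16));

	if (fd < 0 || !buf || ftruncate(fd, BBTOB(128)) != 0)
		return 1;
	memset(buf, 0xab, BBTOB(8));
	printf("write: %d\n", log_do_io(fd, 10, 5, buf, 1, 8, 128));
	printf("read:  %d\n", log_do_io(fd, 10, 5, buf, 0, 8, 128));
	free(buf);
	close(fd);
	return 0;
}

The caller's buffer must already include the slack the rounding can add, which is exactly why xlog_alloc_buffer() oversizes its allocations.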
- */ STATIC int xlog_bwrite( struct xlog *log, xfs_daddr_t blk_no, int nbblks, - struct xfs_buf *bp) + char *data) { - int error; - - if (!xlog_verify_bp(log, blk_no, nbblks)) { - xfs_warn(log->l_mp, - "Invalid log block/length (0x%llx, 0x%x) for buffer", - blk_no, nbblks); - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); - return -EFSCORRUPTED; - } - - blk_no = round_down(blk_no, log->l_sectBBsize); - nbblks = round_up(nbblks, log->l_sectBBsize); - - ASSERT(nbblks > 0); - ASSERT(nbblks <= bp->b_length); - - XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); - xfs_buf_hold(bp); - xfs_buf_lock(bp); - bp->b_io_length = nbblks; - bp->b_error = 0; - - error = xfs_bwrite(bp); - if (error) - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - return error; + return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE); } #ifdef DEBUG @@ -377,10 +296,9 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! */ - if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) { xfs_buf_ioerror_alert(bp, __func__); - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); + xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); } } @@ -405,7 +323,7 @@ xlog_recover_iodone( STATIC int xlog_find_cycle_start( struct xlog *log, - struct xfs_buf *bp, + char *buffer, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) @@ -419,7 +337,7 @@ xlog_find_cycle_start( end_blk = *last_blk; mid_blk = BLK_AVG(first_blk, end_blk); while (mid_blk != first_blk && mid_blk != end_blk) { - error = xlog_bread(log, mid_blk, 1, bp, &offset); + error = xlog_bread(log, mid_blk, 1, buffer, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); @@ -455,7 +373,7 @@ xlog_find_verify_cycle( { xfs_daddr_t i, j; uint cycle; - xfs_buf_t *bp; + char *buffer; xfs_daddr_t bufblks; char *buf = NULL; int error = 0; @@ -469,7 +387,7 @@ xlog_find_verify_cycle( bufblks = 1 << ffs(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; - while (!(bp = xlog_get_bp(log, bufblks))) { + while (!(buffer = xlog_alloc_buffer(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) return -ENOMEM; @@ -480,7 +398,7 @@ xlog_find_verify_cycle( bcount = min(bufblks, (start_blk + nbblks - i)); - error = xlog_bread(log, i, bcount, bp, &buf); + error = xlog_bread(log, i, bcount, buffer, &buf); if (error) goto out; @@ -498,7 +416,7 @@ xlog_find_verify_cycle( *new_blk = -1; out: - xlog_put_bp(bp); + kmem_free(buffer); return error; } @@ -522,7 +440,7 @@ xlog_find_verify_log_record( int extra_bblks) { xfs_daddr_t i; - xfs_buf_t *bp; + char *buffer; char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; @@ -532,12 +450,14 @@ xlog_find_verify_log_record( ASSERT(start_blk != 0 || *last_blk != start_blk); - if (!(bp = xlog_get_bp(log, num_blks))) { - if (!(bp = xlog_get_bp(log, 1))) + buffer = xlog_alloc_buffer(log, num_blks); + if (!buffer) { + buffer = xlog_alloc_buffer(log, 1); + if (!buffer) return -ENOMEM; smallmem = 1; } else { - error = xlog_bread(log, start_blk, num_blks, bp, &offset); + error = xlog_bread(log, start_blk, num_blks, buffer, &offset); if (error) goto out; offset += ((num_blks - 1) << BBSHIFT); @@ -554,7 +474,7 @@ xlog_find_verify_log_record( } if (smallmem) { - error = xlog_bread(log, i, 1, bp, &offset); + error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out; } @@ -607,7 +527,7 @@ xlog_find_verify_log_record( *last_blk = i; out: - xlog_put_bp(bp); + kmem_free(buffer); return 
error; } @@ -629,7 +549,7 @@ xlog_find_head( struct xlog *log, xfs_daddr_t *return_head_blk) { - xfs_buf_t *bp; + char *buffer; char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; @@ -659,20 +579,20 @@ xlog_find_head( } first_blk = 0; /* get cycle # of 1st block */ - bp = xlog_get_bp(log, 1); - if (!bp) + buffer = xlog_alloc_buffer(log, 1); + if (!buffer) return -ENOMEM; - error = xlog_bread(log, 0, 1, bp, &offset); + error = xlog_bread(log, 0, 1, buffer, &offset); if (error) - goto bp_err; + goto out_free_buffer; first_half_cycle = xlog_get_cycle(offset); last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ - error = xlog_bread(log, last_blk, 1, bp, &offset); + error = xlog_bread(log, last_blk, 1, buffer, &offset); if (error) - goto bp_err; + goto out_free_buffer; last_half_cycle = xlog_get_cycle(offset); ASSERT(last_half_cycle != 0); @@ -740,9 +660,10 @@ xlog_find_head( * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; - if ((error = xlog_find_cycle_start(log, bp, first_blk, - &head_blk, last_half_cycle))) - goto bp_err; + error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk, + last_half_cycle); + if (error) + goto out_free_buffer; } /* @@ -762,7 +683,7 @@ xlog_find_head( if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks, stop_on_cycle, &new_blk))) - goto bp_err; + goto out_free_buffer; if (new_blk != -1) head_blk = new_blk; } else { /* need to read 2 parts of log */ @@ -799,7 +720,7 @@ xlog_find_head( if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) - goto bp_err; + goto out_free_buffer; if (new_blk != -1) { head_blk = new_blk; goto validate_head; @@ -815,7 +736,7 @@ xlog_find_head( if ((error = xlog_find_verify_cycle(log, start_blk, (int)head_blk, stop_on_cycle, &new_blk))) - goto bp_err; + goto out_free_buffer; if (new_blk != -1) head_blk = new_blk; } @@ -834,13 +755,13 @@ validate_head: if (error == 1) error = -EIO; if (error) - goto bp_err; + goto out_free_buffer; } else { start_blk = 0; ASSERT(head_blk <= INT_MAX); error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0); if (error < 0) - goto bp_err; + goto out_free_buffer; if (error == 1) { /* We hit the beginning of the log during our search */ start_blk = log_bbnum - (num_scan_bblks - head_blk); @@ -853,14 +774,14 @@ validate_head: if (error == 1) error = -EIO; if (error) - goto bp_err; + goto out_free_buffer; if (new_blk != log_bbnum) head_blk = new_blk; } else if (error) - goto bp_err; + goto out_free_buffer; } - xlog_put_bp(bp); + kmem_free(buffer); if (head_blk == log_bbnum) *return_head_blk = 0; else @@ -873,9 +794,8 @@ validate_head: */ return 0; - bp_err: - xlog_put_bp(bp); - +out_free_buffer: + kmem_free(buffer); if (error) xfs_warn(log->l_mp, "failed to find log head"); return error; @@ -895,7 +815,7 @@ xlog_rseek_logrec_hdr( xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int count, - struct xfs_buf *bp, + char *buffer, xfs_daddr_t *rblk, struct xlog_rec_header **rhead, bool *wrapped) @@ -914,7 +834,7 @@ xlog_rseek_logrec_hdr( */ end_blk = head_blk > tail_blk ? 
tail_blk : 0; for (i = (int) head_blk - 1; i >= end_blk; i--) { - error = xlog_bread(log, i, 1, bp, &offset); + error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; @@ -933,7 +853,7 @@ xlog_rseek_logrec_hdr( */ if (tail_blk >= head_blk && found != count) { for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { - error = xlog_bread(log, i, 1, bp, &offset); + error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; @@ -969,7 +889,7 @@ xlog_seek_logrec_hdr( xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int count, - struct xfs_buf *bp, + char *buffer, xfs_daddr_t *rblk, struct xlog_rec_header **rhead, bool *wrapped) @@ -988,7 +908,7 @@ xlog_seek_logrec_hdr( */ end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; for (i = (int) tail_blk; i <= end_blk; i++) { - error = xlog_bread(log, i, 1, bp, &offset); + error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; @@ -1006,7 +926,7 @@ xlog_seek_logrec_hdr( */ if (tail_blk > head_blk && found != count) { for (i = 0; i < (int) head_blk; i++) { - error = xlog_bread(log, i, 1, bp, &offset); + error = xlog_bread(log, i, 1, buffer, &offset); if (error) goto out_error; @@ -1069,22 +989,22 @@ xlog_verify_tail( int hsize) { struct xlog_rec_header *thead; - struct xfs_buf *bp; + char *buffer; xfs_daddr_t first_bad; int error = 0; bool wrapped; xfs_daddr_t tmp_tail; xfs_daddr_t orig_tail = *tail_blk; - bp = xlog_get_bp(log, 1); - if (!bp) + buffer = xlog_alloc_buffer(log, 1); + if (!buffer) return -ENOMEM; /* * Make sure the tail points to a record (returns positive count on * success). */ - error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer, &tmp_tail, &thead, &wrapped); if (error < 0) goto out; @@ -1113,8 +1033,8 @@ xlog_verify_tail( break; /* skip to the next record; returns positive count on success */ - error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, - &tmp_tail, &thead, &wrapped); + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, + buffer, &tmp_tail, &thead, &wrapped); if (error < 0) goto out; @@ -1129,7 +1049,7 @@ xlog_verify_tail( "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", orig_tail, *tail_blk); out: - xlog_put_bp(bp); + kmem_free(buffer); return error; } @@ -1151,13 +1071,13 @@ xlog_verify_head( struct xlog *log, xfs_daddr_t *head_blk, /* in/out: unverified head */ xfs_daddr_t *tail_blk, /* out: tail block */ - struct xfs_buf *bp, + char *buffer, xfs_daddr_t *rhead_blk, /* start blk of last record */ struct xlog_rec_header **rhead, /* ptr to last record */ bool *wrapped) /* last rec. wraps phys. log */ { struct xlog_rec_header *tmp_rhead; - struct xfs_buf *tmp_bp; + char *tmp_buffer; xfs_daddr_t first_bad; xfs_daddr_t tmp_rhead_blk; int found; @@ -1168,15 +1088,15 @@ xlog_verify_head( * Check the head of the log for torn writes. Search backwards from the * head until we hit the tail or the maximum number of log record I/Os * that could have been in flight at one time. Use a temporary buffer so - * we don't trash the rhead/bp pointers from the caller. + * we don't trash the rhead/buffer pointers from the caller. 
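xlog_find_cycle_start(), converted a few hunks back, is a binary search: block cycle numbers are monotonic around the wrap point, so one probe of the midpoint says which half holds the head. The same search over an in-memory array, where indexing replaces the one-block xlog_bread() per probe:

#include <stdio.h>

/*
 * Bisect for the first block carrying the older cycle number; the log
 * head sits there. Mirrors xlog_find_cycle_start(), which does one
 * disk read per probe instead of an array access.
 */
static long find_cycle_start(const unsigned int *cycles, long first_blk,
			     long last_blk, unsigned int want_cycle)
{
	long mid_blk = (first_blk + last_blk) / 2;

	while (mid_blk != first_blk && mid_blk != last_blk) {
		if (cycles[mid_blk] == want_cycle)
			last_blk = mid_blk;	/* head at or before mid */
		else
			first_blk = mid_blk;	/* head after mid */
		mid_blk = (first_blk + last_blk) / 2;
	}
	return last_blk;
}

int main(void)
{
	/* A wrapped log: blocks 0-5 already rewritten in cycle 8. */
	unsigned int cycles[16] = { 8, 8, 8, 8, 8, 8,
				    7, 7, 7, 7, 7, 7, 7, 7, 7, 7 };

	printf("head candidate at block %ld\n",
	       find_cycle_start(cycles, 0, 15, 7));
	return 0;
}

Torn writes mean the transition point is only approximate, which is why xlog_find_head() follows up with xlog_find_verify_cycle() over a window of blocks around the answer.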
*/ - tmp_bp = xlog_get_bp(log, 1); - if (!tmp_bp) + tmp_buffer = xlog_alloc_buffer(log, 1); + if (!tmp_buffer) return -ENOMEM; error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, - XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk, - &tmp_rhead, &tmp_wrapped); - xlog_put_bp(tmp_bp); + XLOG_MAX_ICLOGS, tmp_buffer, + &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped); + kmem_free(tmp_buffer); if (error < 0) return error; @@ -1205,8 +1125,8 @@ xlog_verify_head( * (i.e., the records with invalid CRC) if the cycle number * matches the current cycle. */ - found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp, - rhead_blk, rhead, wrapped); + found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, + buffer, rhead_blk, rhead, wrapped); if (found < 0) return found; if (found == 0) /* XXX: right thing to do here? */ @@ -1266,7 +1186,7 @@ xlog_check_unmount_rec( xfs_daddr_t *tail_blk, struct xlog_rec_header *rhead, xfs_daddr_t rhead_blk, - struct xfs_buf *bp, + char *buffer, bool *clean) { struct xlog_op_header *op_head; @@ -1309,7 +1229,7 @@ xlog_check_unmount_rec( if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks); - error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + error = xlog_bread(log, umount_data_blk, 1, buffer, &offset); if (error) return error; @@ -1388,7 +1308,7 @@ xlog_find_tail( { xlog_rec_header_t *rhead; char *offset = NULL; - xfs_buf_t *bp; + char *buffer; int error; xfs_daddr_t rhead_blk; xfs_lsn_t tail_lsn; @@ -1402,11 +1322,11 @@ xlog_find_tail( return error; ASSERT(*head_blk < INT_MAX); - bp = xlog_get_bp(log, 1); - if (!bp) + buffer = xlog_alloc_buffer(log, 1); + if (!buffer) return -ENOMEM; if (*head_blk == 0) { /* special case */ - error = xlog_bread(log, 0, 1, bp, &offset); + error = xlog_bread(log, 0, 1, buffer, &offset); if (error) goto done; @@ -1422,7 +1342,7 @@ xlog_find_tail( * block. This wraps all the way back around to the head so something is * seriously wrong if we can't find it. */ - error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, + error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer, &rhead_blk, &rhead, &wrapped); if (error < 0) return error; @@ -1443,7 +1363,7 @@ xlog_find_tail( * state to determine whether recovery is necessary. */ error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, - rhead_blk, bp, &clean); + rhead_blk, buffer, &clean); if (error) goto done; @@ -1460,7 +1380,7 @@ xlog_find_tail( if (!clean) { xfs_daddr_t orig_head = *head_blk; - error = xlog_verify_head(log, head_blk, tail_blk, bp, + error = xlog_verify_head(log, head_blk, tail_blk, buffer, &rhead_blk, &rhead, &wrapped); if (error) goto done; @@ -1471,7 +1391,7 @@ xlog_find_tail( wrapped); tail_lsn = atomic64_read(&log->l_tail_lsn); error = xlog_check_unmount_rec(log, head_blk, tail_blk, - rhead, rhead_blk, bp, + rhead, rhead_blk, buffer, &clean); if (error) goto done; @@ -1505,11 +1425,11 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter.
*/ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) + if (!xfs_readonly_buftarg(log->l_targ)) error = xlog_clear_stale_blocks(log, tail_lsn); done: - xlog_put_bp(bp); + kmem_free(buffer); if (error) xfs_warn(log->l_mp, "failed to locate log tail"); @@ -1537,7 +1457,7 @@ xlog_find_zeroed( struct xlog *log, xfs_daddr_t *blk_no) { - xfs_buf_t *bp; + char *buffer; char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; @@ -1547,35 +1467,36 @@ xlog_find_zeroed( *blk_no = 0; /* check totally zeroed log */ - bp = xlog_get_bp(log, 1); - if (!bp) + buffer = xlog_alloc_buffer(log, 1); + if (!buffer) return -ENOMEM; - error = xlog_bread(log, 0, 1, bp, &offset); + error = xlog_bread(log, 0, 1, buffer, &offset); if (error) - goto bp_err; + goto out_free_buffer; first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; - xlog_put_bp(bp); + kmem_free(buffer); return 1; } /* check partially zeroed log */ - error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset); if (error) - goto bp_err; + goto out_free_buffer; last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ - xlog_put_bp(bp); + kmem_free(buffer); return 0; } /* we have a partially zeroed log */ last_blk = log_bbnum-1; - if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) - goto bp_err; + error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0); + if (error) + goto out_free_buffer; /* * Validate the answer. Because there is no way to guarantee that @@ -1598,7 +1519,7 @@ xlog_find_zeroed( */ if ((error = xlog_find_verify_cycle(log, start_blk, (int)num_scan_bblks, 0, &new_blk))) - goto bp_err; + goto out_free_buffer; if (new_blk != -1) last_blk = new_blk; @@ -1610,11 +1531,11 @@ xlog_find_zeroed( if (error == 1) error = -EIO; if (error) - goto bp_err; + goto out_free_buffer; *blk_no = last_blk; -bp_err: - xlog_put_bp(bp); +out_free_buffer: + kmem_free(buffer); if (error) return error; return 1; @@ -1657,7 +1578,7 @@ xlog_write_log_records( int tail_block) { char *offset; - xfs_buf_t *bp; + char *buffer; int balign, ealign; int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; @@ -1674,7 +1595,7 @@ xlog_write_log_records( bufblks = 1 << ffs(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; - while (!(bp = xlog_get_bp(log, bufblks))) { + while (!(buffer = xlog_alloc_buffer(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) return -ENOMEM; @@ -1686,9 +1607,9 @@ xlog_write_log_records( */ balign = round_down(start_block, sectbb); if (balign != start_block) { - error = xlog_bread_noalign(log, start_block, 1, bp); + error = xlog_bread_noalign(log, start_block, 1, buffer); if (error) - goto out_put_bp; + goto out_free_buffer; j = start_block - balign; } @@ -1705,29 +1626,28 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = bp->b_addr + BBTOB(ealign - start_block); - error = xlog_bread_offset(log, ealign, sectbb, - bp, offset); + error = xlog_bread_noalign(log, ealign, sectbb, + buffer + BBTOB(ealign - start_block)); if (error) break; } - offset = xlog_align(log, start_block, endcount, bp); + offset = buffer + xlog_align(log, start_block); for (; j < endcount; j++) { xlog_add_record(log, offset, cycle, i+j, tail_cycle, tail_block); offset += BBSIZE; } - error = xlog_bwrite(log, start_block, endcount, bp); + error = xlog_bwrite(log, start_block, endcount, buffer); 
if (error) break; start_block += endcount; j = 0; } - out_put_bp: - xlog_put_bp(bp); +out_free_buffer: + kmem_free(buffer); return error; } @@ -2162,7 +2082,7 @@ xlog_recover_do_inode_buffer( if (xfs_sb_version_hascrc(&mp->m_sb)) bp->b_ops = &xfs_inode_buf_ops; - inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; + inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -2204,8 +2124,7 @@ xlog_recover_do_inode_buffer( ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= - BBTOB(bp->b_io_length)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); /* * The current logged region contains a copy of the @@ -2439,17 +2358,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_BTREE_BUF: switch (magic32) { case XFS_ABTB_CRC_MAGIC: - case XFS_ABTC_CRC_MAGIC: case XFS_ABTB_MAGIC: + bp->b_ops = &xfs_bnobt_buf_ops; + break; + case XFS_ABTC_CRC_MAGIC: case XFS_ABTC_MAGIC: - bp->b_ops = &xfs_allocbt_buf_ops; + bp->b_ops = &xfs_cntbt_buf_ops; break; case XFS_IBT_CRC_MAGIC: - case XFS_FIBT_CRC_MAGIC: case XFS_IBT_MAGIC: - case XFS_FIBT_MAGIC: bp->b_ops = &xfs_inobt_buf_ops; break; + case XFS_FIBT_CRC_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_finobt_buf_ops; + break; case XFS_BMAP_CRC_MAGIC: case XFS_BMAP_MAGIC: bp->b_ops = &xfs_bmbt_buf_ops; @@ -2666,7 +2589,7 @@ xlog_recover_do_reg_buffer( ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(BBTOB(bp->b_io_length) >= + ASSERT(BBTOB(bp->b_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* @@ -2878,23 +2801,22 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode + * or inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't max(blocksize, mp->m_inode_cluster_size) - * for *our* value of mp->m_inode_cluster_size, then we need to keep + * the inode buffer size isn't max(blocksize, inode_cluster_size) + * for *our* value of inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (BBTOB(bp->b_io_length) != max(log->l_mp->m_sb.sb_blocksize, - (uint32_t)log->l_mp->m_inode_cluster_size))) { + (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { - ASSERT(bp->b_target->bt_mount == mp); + ASSERT(bp->b_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); } @@ -3045,7 +2967,7 @@ xlog_recover_inode_pass2( * Make sure the place we're flushing out to really looks * like an inode! 
*/ - if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + if (unlikely(!xfs_verify_magic16(bp, dip->di_magic))) { xfs_alert(mp, "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld", __func__, dip, bp, in_f->ilf_ino); @@ -3256,7 +3178,7 @@ out_owner_change: /* re-generate the checksum. */ xfs_dinode_calc_crc(log->l_mp, dip); - ASSERT(bp->b_target->bt_mount == mp); + ASSERT(bp->b_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); @@ -3395,7 +3317,7 @@ xlog_recover_dquot_pass2( } ASSERT(dq_f->qlf_size == 2); - ASSERT(bp->b_target->bt_mount == mp); + ASSERT(bp->b_mount == mp); bp->b_iodone = xlog_recover_iodone; xfs_buf_delwri_queue(bp, buffer_list); @@ -3459,7 +3381,7 @@ xlog_recover_efd_pass2( { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; - xfs_log_item_t *lip; + struct xfs_log_item *lip; uint64_t efi_id; struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; @@ -3845,6 +3767,7 @@ xlog_recover_do_icreate_pass2( { struct xfs_mount *mp = log->l_mp; struct xfs_icreate_log *icl; + struct xfs_ino_geometry *igeo = M_IGEO(mp); xfs_agnumber_t agno; xfs_agblock_t agbno; unsigned int count; @@ -3894,10 +3817,10 @@ xlog_recover_do_icreate_pass2( /* * The inode chunk is either full or sparse and we only support - * m_ialloc_min_blks sized sparse allocations at this time. + * m_ino_geo.ialloc_min_blks sized sparse allocations at this time. */ - if (length != mp->m_ialloc_blks && - length != mp->m_ialloc_min_blks) { + if (length != igeo->ialloc_blks && + length != igeo->ialloc_min_blks) { xfs_warn(log->l_mp, "%s: unsupported chunk length", __FUNCTION__); return -EINVAL; @@ -3917,13 +3840,13 @@ xlog_recover_do_icreate_pass2( * buffers for cancellation so we don't overwrite anything written after * a cancellation. */ - bb_per_cluster = XFS_FSB_TO_BB(mp, mp->m_blocks_per_cluster); - nbufs = length / mp->m_blocks_per_cluster; + bb_per_cluster = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + nbufs = length / igeo->blocks_per_cluster; for (i = 0, cancel_count = 0; i < nbufs; i++) { xfs_daddr_t daddr; daddr = XFS_AGB_TO_DADDR(mp, agno, - agbno + i * mp->m_blocks_per_cluster); + agbno + i * igeo->blocks_per_cluster); if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0)) cancel_count++; } @@ -4952,12 +4875,11 @@ out: * A cancel occurs when the mount has failed and we're bailing out. * Release all pending log intent items so they don't pin the AIL. 
*/ -STATIC int +STATIC void xlog_recover_cancel_intents( struct xlog *log) { struct xfs_log_item *lip; - int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; @@ -4997,7 +4919,6 @@ xlog_recover_cancel_intents( xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); - return error; } /* @@ -5163,7 +5084,7 @@ xlog_recover_process_iunlinks( } } -STATIC int +STATIC void xlog_unpack_data( struct xlog_rec_header *rhead, char *dp, @@ -5186,8 +5107,6 @@ xlog_unpack_data( dp += BBSIZE; } } - - return 0; } /* @@ -5202,11 +5121,9 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - int error; __le32 old_crc = rhead->h_crc; __le32 crc; - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* @@ -5245,9 +5162,7 @@ xlog_recover_process( return -EFSCORRUPTED; } - error = xlog_unpack_data(rhead, dp, log); - if (error) - return error; + xlog_unpack_data(rhead, dp, log); return xlog_recover_process_data(log, rhash, rhead, dp, pass, buffer_list); @@ -5309,7 +5224,7 @@ xlog_do_recovery_pass( xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; - xfs_buf_t *hbp, *dbp; + char *hbp, *dbp; int error = 0, h_size, h_len; int error2 = 0; int bblks, split_bblks; @@ -5334,7 +5249,7 @@ xlog_do_recovery_pass( * iclog header and extract the header size from it. Get a * new hbp that is the correct size. */ - hbp = xlog_get_bp(log, 1); + hbp = xlog_alloc_buffer(log, 1); if (!hbp) return -ENOMEM; @@ -5376,23 +5291,23 @@ xlog_do_recovery_pass( hblks = h_size / XLOG_HEADER_CYCLE_SIZE; if (h_size % XLOG_HEADER_CYCLE_SIZE) hblks++; - xlog_put_bp(hbp); - hbp = xlog_get_bp(log, hblks); + kmem_free(hbp); + hbp = xlog_alloc_buffer(log, hblks); } else { hblks = 1; } } else { ASSERT(log->l_sectBBsize == 1); hblks = 1; - hbp = xlog_get_bp(log, 1); + hbp = xlog_alloc_buffer(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; } if (!hbp) return -ENOMEM; - dbp = xlog_get_bp(log, BTOBB(h_size)); + dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { - xlog_put_bp(hbp); + kmem_free(hbp); return -ENOMEM; } @@ -5407,7 +5322,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = hbp->b_addr; + offset = hbp; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -5443,8 +5358,8 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - error = xlog_bread_offset(log, 0, - wrapped_hblks, hbp, + error = xlog_bread_noalign(log, 0, + wrapped_hblks, offset + BBTOB(split_hblks)); if (error) goto bread_err2; @@ -5475,7 +5390,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = dbp->b_addr; + offset = dbp; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -5504,8 +5419,8 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - error = xlog_bread_offset(log, 0, - bblks - split_bblks, dbp, + error = xlog_bread_noalign(log, 0, + bblks - split_bblks, offset + BBTOB(split_bblks)); if (error) goto bread_err2; @@ -5553,9 +5468,9 @@ xlog_do_recovery_pass( } bread_err2: - xlog_put_bp(dbp); + kmem_free(dbp); bread_err1: - xlog_put_bp(hbp); + kmem_free(hbp); /* * Submit buffers that have been added from the last record processed, @@ -5689,7 +5604,7 @@ xlog_do_recover( * Now that we've finished replaying all buffer and inode * updates, re-read in the superblock and reverify it. 
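When a record straddles the physical end of the log, xlog_do_recovery_pass() now stitches it together with two xlog_bread_noalign() calls into one flat buffer, the second read landing at byte offset BBTOB(split) of the first. The same stitching against an in-memory circular "device":

#include <stdio.h>
#include <string.h>

#define BBSIZE 512

/*
 * Copy nbblks blocks starting at blk from a circular "log" of
 * log_bblocks blocks into a flat buffer, splitting at the wrap the
 * way xlog_do_recovery_pass() splits its reads.
 */
static void read_wrapped(const unsigned char *log, long log_bblocks,
			 long blk, long nbblks, unsigned char *out)
{
	long split = 0;

	if (blk + nbblks > log_bblocks)
		split = log_bblocks - blk;	/* blocks before the wrap */

	if (split) {
		/* tail of the physical log first ... */
		memcpy(out, log + blk * BBSIZE, split * BBSIZE);
		/* ... then the rest from the physical start */
		memcpy(out + split * BBSIZE, log,
		       (nbblks - split) * BBSIZE);
	} else {
		memcpy(out, log + blk * BBSIZE, nbblks * BBSIZE);
	}
}

int main(void)
{
	static unsigned char log[8 * BBSIZE], out[4 * BBSIZE];

	for (long i = 0; i < 8; i++)
		memset(log + i * BBSIZE, 'a' + (int)i, BBSIZE);

	read_wrapped(log, 8, 6, 4, out);	/* blocks 6, 7, 0, 1 */
	for (long i = 0; i < 4; i++)
		printf("out block %ld holds '%c'\n", i, out[i * BBSIZE]);
	return 0;
}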
*/ - bp = xfs_getsb(mp, 0); + bp = xfs_getsb(mp); bp->b_flags &= ~(XBF_DONE | XBF_ASYNC); ASSERT(!(bp->b_flags & XBF_WRITE)); bp->b_flags |= XBF_READ; @@ -5862,16 +5777,12 @@ xlog_recover_finish( return 0; } -int +void xlog_recover_cancel( struct xlog *log) { - int error = 0; - if (log->l_flags & XLOG_RECOVERY_NEEDED) - error = xlog_recover_cancel_intents(log); - - return error; + xlog_recover_cancel_intents(log); } #if defined(DEBUG) diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 6b736ea58d35..9804efe525a9 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -6,8 +6,8 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_error.h" +#include "xfs_shared.h" #include "xfs_format.h" -#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b4d8c318be3c..322da6909290 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -12,9 +12,6 @@ #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_ialloc.h" @@ -27,13 +24,13 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" -#include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_reflink.h" #include "xfs_extent_busy.h" +#include "xfs_health.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -149,6 +146,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -227,6 +225,10 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + error = xfs_iunlink_init(pag); + if (error) + goto out_hash_destroy; + spin_lock_init(&pag->pag_state_lock); } index = xfs_set_inode_alloc(mp, agcount); @@ -249,6 +251,7 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); + xfs_iunlink_destroy(pag); mutex_destroy(&pag->pag_ici_reclaim_lock); kmem_free(pag); } @@ -423,30 +426,6 @@ xfs_update_alignment(xfs_mount_t *mp) } /* - * Set the maximum inode count for this filesystem - */ -STATIC void -xfs_set_maxicount(xfs_mount_t *mp) -{ - xfs_sb_t *sbp = &(mp->m_sb); - uint64_t icount; - - if (sbp->sb_imax_pct) { - /* - * Make sure the maximum inode count is a multiple - * of the units we allocate inodes in. - */ - icount = sbp->sb_dblocks * sbp->sb_imax_pct; - do_div(icount, 100); - do_div(icount, mp->m_ialloc_blks); - mp->m_maxicount = (icount * mp->m_ialloc_blks) << - sbp->sb_inopblog; - } else { - mp->m_maxicount = 0; - } -} - -/* * Set the default minimum read and write sizes unless * already specified in a mount option. * We use smaller I/O sizes when the file system @@ -502,29 +481,6 @@ xfs_set_low_space_thresholds( } } - -/* - * Set whether we're using inode alignment. 
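The deleted xfs_set_maxicount() (its calculation moves with the rest of the inode geometry into xfs_ialloc_setup_geometry()) derived the inode limit as sb_imax_pct percent of the data blocks, rounded down to whole inode chunks and converted from blocks to inodes. The arithmetic as a small program; the geometry numbers in main() are illustrative only:

#include <stdio.h>
#include <stdint.h>

/*
 * Max inode count: imax_pct percent of dblocks, cut down to a whole
 * number of ialloc_blks-sized chunks, then shifted from blocks to
 * inodes by inopblog (log2 of inodes per block).
 */
static uint64_t max_icount(uint64_t dblocks, unsigned int imax_pct,
			   unsigned int ialloc_blks, unsigned int inopblog)
{
	uint64_t icount;

	if (!imax_pct)
		return 0;			/* no limit configured */

	icount = dblocks * imax_pct / 100;	/* blocks allowed for inodes */
	icount /= ialloc_blks;			/* whole inode chunks */
	return (icount * ialloc_blks) << inopblog;
}

int main(void)
{
	/* 1M blocks, 5% for inodes, 16-block chunks, 32 inodes/block */
	printf("maxicount = %llu\n",
	       (unsigned long long)max_icount(1048576, 5, 16, 5));
	return 0;
}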
- */ -STATIC void -xfs_set_inoalignment(xfs_mount_t *mp) -{ - if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) - mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; - else - mp->m_inoalign_mask = 0; - /* - * If we are using stripe alignment, check whether - * the stripe unit is a multiple of the inode alignment - */ - if (mp->m_dalign && mp->m_inoalign_mask && - !(mp->m_dalign & mp->m_inoalign_mask)) - mp->m_sinoalign = mp->m_dalign; - else - mp->m_sinoalign = 0; -} - /* * Check that the data (and log if separate) is an ok size. */ @@ -639,7 +595,7 @@ xfs_check_summary_counts( (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || !xfs_verify_icount(mp, mp->m_sb.sb_icount) || mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) - mp->m_flags |= XFS_MOUNT_BAD_SUMMARY; + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); /* * We can safely re-initialise incore superblock counters from the @@ -654,7 +610,7 @@ xfs_check_summary_counts( */ if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) || XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) && - !(mp->m_flags & XFS_MOUNT_BAD_SUMMARY)) + !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) return 0; return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount); @@ -676,6 +632,7 @@ xfs_mountfs( { struct xfs_sb *sbp = &(mp->m_sb); struct xfs_inode *rip; + struct xfs_ino_geometry *igeo = M_IGEO(mp); uint64_t resblks; uint quotamount = 0; uint quotaflags = 0; @@ -742,12 +699,10 @@ xfs_mountfs( xfs_alloc_compute_maxlevels(mp); xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); - xfs_ialloc_compute_maxlevels(mp); + xfs_ialloc_setup_geometry(mp); xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); - xfs_set_maxicount(mp); - /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; @@ -781,50 +736,22 @@ xfs_mountfs( xfs_set_low_space_thresholds(mp); /* - * Set the inode cluster size. - * This may still be overridden by the file system - * block size if it is larger than the chosen cluster size. - * - * For v5 filesystems, scale the cluster size with the inode size to - * keep a constant ratio of inode per cluster buffer, but only if mkfs - * has set the inode alignment value appropriately for larger cluster - * sizes. - */ - mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - int new_size = mp->m_inode_cluster_size; - - new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; - if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) - mp->m_inode_cluster_size = new_size; - } - mp->m_blocks_per_cluster = xfs_icluster_size_fsb(mp); - mp->m_inodes_per_cluster = XFS_FSB_TO_INO(mp, mp->m_blocks_per_cluster); - mp->m_cluster_align = xfs_ialloc_cluster_alignment(mp); - mp->m_cluster_align_inodes = XFS_FSB_TO_INO(mp, mp->m_cluster_align); - - /* * If enabled, sparse inode chunk alignment is expected to match the * cluster size. Full inode chunk alignment must match the chunk size, * but that is checked on sb read verification... 
*/ if (xfs_sb_version_hassparseinodes(&mp->m_sb) && mp->m_sb.sb_spino_align != - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { + XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { xfs_warn(mp, "Sparse inode block alignment (%u) must match cluster size (%llu).", mp->m_sb.sb_spino_align, - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); + XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)); error = -EINVAL; goto out_remove_uuid; } /* - * Set inode alignment fields - */ - xfs_set_inoalignment(mp); - - /* * Check that the data (and log if separate) is an ok size. */ error = xfs_check_sizes(mp); @@ -1063,6 +990,7 @@ xfs_mountfs( */ cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_health_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); @@ -1099,7 +1027,7 @@ xfs_unmountfs( uint64_t resblks; int error; - xfs_icache_disable_reclaim(mp); + xfs_stop_block_reaping(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); @@ -1145,6 +1073,7 @@ xfs_unmountfs( */ cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + xfs_health_unmount(mp); xfs_qm_unmount(mp); @@ -1376,24 +1305,14 @@ xfs_mod_frextents( * xfs_getsb() is called to obtain the buffer for the superblock. * The buffer is returned locked and read in from disk. * The buffer should be released with a call to xfs_buf_relse(). - * - * If the flags parameter is BUF_TRYLOCK, then we'll only return - * the superblock buffer if it can be locked without sleeping. - * If it can't then we'll return NULL. */ struct xfs_buf * xfs_getsb( - struct xfs_mount *mp, - int flags) + struct xfs_mount *mp) { struct xfs_buf *bp = mp->m_sb_bp; - if (!xfs_buf_trylock(bp)) { - if (flags & XBF_TRYLOCK) - return NULL; - xfs_buf_lock(bp); - } - + xfs_buf_lock(bp); xfs_buf_hold(bp); ASSERT(bp->b_flags & XBF_DONE); return bp; } @@ -1440,7 +1359,26 @@ xfs_force_summary_recalc( if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) return; - spin_lock(&mp->m_sb_lock); - mp->m_flags |= XFS_MOUNT_BAD_SUMMARY; - spin_unlock(&mp->m_sb_lock); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); +} + +/* + * Update the in-core delayed block counter. + * + * We prefer to update the counter without having to take a spinlock for every + * counter update (i.e. batching). Each change to delayed allocation + * reservations can easily exceed the default percpu counter + * batching, so we use a larger batch factor here. + * + * Note that we don't currently have any callers requiring fast summation + * (e.g. percpu_counter_read) so we can use a big batch value here. + */ +#define XFS_DELALLOC_BATCH (4096) +void +xfs_mod_delalloc( + struct xfs_mount *mp, + int64_t delta) +{ + percpu_counter_add_batch(&mp->m_delalloc_blks, delta, + XFS_DELALLOC_BATCH); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7daafe064af8..4adb6837439a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,6 +60,20 @@ struct xfs_error_cfg { typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ + + /* + * Bitsets of per-fs metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access these two fields. + */ + uint8_t m_fs_checked; + uint8_t m_fs_sick; + /* + * Bitsets of rt metadata that have been checked and/or are sick. + * Callers must hold m_sb_lock to access this field. 
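The XFS_DELALLOC_BATCH comment above is the whole story: delalloc reservations change in large steps, so the default per-CPU batch would fold into the shared count (and take its spinlock) far too often. A standalone illustration of the batch parameter, using only the generic percpu_counter API; the function name is hypothetical:

static void
example_mod_delalloc(
	struct percpu_counter	*counter,
	int64_t			delta)
{
	/*
	 * percpu_counter_add() uses the small default batch, so a delta of
	 * a few thousand blocks would hit the shared count every time.  An
	 * explicit 4096 batch keeps hot-path updates CPU-local until the
	 * local delta drifts past 4096; only percpu_counter_sum() callers
	 * pay for full accuracy.
	 */
	percpu_counter_add_batch(counter, delta, 4096);
}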
+ */ + uint8_t m_rt_checked; + uint8_t m_rt_sick; + struct xfs_ail *m_ail; /* fs active log item list */ struct xfs_sb m_sb; /* copy of fs superblock */ @@ -67,6 +81,12 @@ typedef struct xfs_mount { struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ struct percpu_counter m_fdblocks; /* free block counter */ + /* + * Count of data device blocks reserved for delayed allocations, + * including indlen blocks. Does not include allocated CoW staging + * extents or anything related to the rt device. + */ + struct percpu_counter m_delalloc_blks; struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ @@ -85,6 +105,7 @@ typedef struct xfs_mount { struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ struct xlog *m_log; /* log specific stuff */ + struct xfs_ino_geometry m_ino_geo; /* inode geometry */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ @@ -106,12 +127,6 @@ typedef struct xfs_mount { uint8_t m_blkbit_log; /* blocklog + NBBY */ uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ uint8_t m_agno_log; /* log #ag's */ - uint8_t m_agino_log; /* #bits for agino in inum */ - uint m_inode_cluster_size;/* min inode buf size */ - unsigned int m_inodes_per_cluster; - unsigned int m_blocks_per_cluster; - unsigned int m_cluster_align; - unsigned int m_cluster_align_inodes; uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ @@ -119,15 +134,12 @@ typedef struct xfs_mount { uint m_alloc_mnr[2]; /* min alloc btree records */ uint m_bmap_dmxr[2]; /* max bmap btree records */ uint m_bmap_dmnr[2]; /* min bmap btree records */ - uint m_inobt_mxr[2]; /* max inobt btree records */ - uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ - uint m_in_maxlevels; /* max inobt btree levels. */ uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_refc_maxlevels; /* max refcount btree level */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ @@ -138,21 +150,14 @@ typedef struct xfs_mount { struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint64_t m_flags; /* global mount flags */ - bool m_inotbt_nores; /* no per-AG finobt resv. */ - int m_ialloc_inos; /* inodes in inode allocation */ - int m_ialloc_blks; /* blocks in inode allocation */ - int m_ialloc_min_blks;/* min blocks in sparse inode - * allocation */ - int m_inoalign_mask;/* mask sb_inoalignmt if used */ + bool m_finobt_nores; /* no per-AG finobt resv. 
*/ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ - uint64_t m_maxicount; /* maximum inode count */ uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ - int m_sinoalign; /* stripe unit inode alignment */ uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ @@ -175,11 +180,9 @@ typedef struct xfs_mount { struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; - struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; struct workqueue_struct *m_sync_workqueue; @@ -194,6 +197,7 @@ typedef struct xfs_mount { */ uint32_t m_generation; + bool m_always_cow; bool m_fail_unmount; #ifdef DEBUG /* @@ -206,6 +210,8 @@ typedef struct xfs_mount { #endif } xfs_mount_t; +#define M_IGEO(mp) (&(mp)->m_ino_geo) + /* * Flags for m_flags. */ @@ -213,7 +219,6 @@ typedef struct xfs_mount { must be synchronous except for space allocations */ #define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ -#define XFS_MOUNT_BAD_SUMMARY (1ULL << 2) /* summary counters are bad */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -368,6 +373,15 @@ typedef struct xfs_perag { xfs_agino_t pagl_pagino; xfs_agino_t pagl_leftrec; xfs_agino_t pagl_rightrec; + + /* + * Bitsets of per-ag metadata that have been checked and/or are sick. + * Callers should hold pag_state_lock before accessing this field. + */ + uint16_t pag_checked; + uint16_t pag_sick; + spinlock_t pag_state_lock; + spinlock_t pagb_lock; /* lock for pagb_tree */ struct rb_root pagb_tree; /* ordered tree of busy extents */ unsigned int pagb_gen; /* generation count for pagb_tree */ @@ -396,6 +410,13 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + + /* + * Unlinked inode information. This incore information reflects + * data stored in the AGI, so callers must hold the AGI buffer lock + * or have some other means to control concurrency. 
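The per-AG unlinked-inode bookkeeping lands in the rhashtable declared just below (pagi_unlinked_hash). A hedged sketch of the rhashtable shape such a cache typically uses; the element type, key choice, and names here are illustrative guesses rather than the patch's exact bookkeeping:

#include <linux/rhashtable.h>

struct example_iunlink {
	xfs_agino_t		next_agino;	/* key: inode we point at */
	xfs_agino_t		prev_agino;	/* inode pointing at us */
	struct rhash_head	node;
};

static const struct rhashtable_params example_iunlink_params = {
	.key_len	= sizeof(xfs_agino_t),
	.key_offset	= offsetof(struct example_iunlink, next_agino),
	.head_offset	= offsetof(struct example_iunlink, node),
};

static int
example_iunlink_init(
	struct xfs_perag	*pag)
{
	/* lookups/updates are serialized by the AGI buffer lock, per above */
	return rhashtable_init(&pag->pagi_unlinked_hash,
			&example_iunlink_params);
}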
+ */ + struct rhashtable pagi_unlinked_hash; } xfs_perag_t; static inline struct xfs_ag_resv * @@ -430,7 +451,7 @@ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); +extern struct xfs_buf *xfs_getsb(xfs_mount_t *); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern bool xfs_fs_writable(struct xfs_mount *mp, int level); @@ -446,5 +467,6 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, int error_class, int error); void xfs_force_summary_recalc(struct xfs_mount *mp); +void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index d3e04d20d8d4..b6701b4f59a9 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -125,6 +125,32 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16); + + /* + * The v5 superblock format extended several v4 header structures with + * additional data. While new fields are only accessible on v5 + * superblocks, it's important that the v5 structures place original v4 + * fields/headers in the correct location on-disk. For example, we must + * be able to find magic values at the same location in certain blocks + * regardless of superblock version. + * + * The following checks ensure that various v5 data structures place the + * subset of v4 metadata associated with the same type of block at the + * start of the on-disk block. If there is no data structure definition + * for certain types of v4 blocks, traverse down to the first field of + * common metadata (e.g., magic value) and make sure it is at offset + * zero. + */ + XFS_CHECK_OFFSET(struct xfs_dir3_leaf, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_da3_intnode, hdr.info.hdr, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_dir3_free, hdr.hdr.magic, 0); + XFS_CHECK_OFFSET(struct xfs_attr3_leafblock, hdr.info.hdr, 0); + + XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat, 192); + XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers, 24); + XFS_CHECK_STRUCT_SIZE(struct xfs_bulkstat_req, 64); + XFS_CHECK_STRUCT_SIZE(struct xfs_inumbers_req, 64); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index f44c3599527d..0c954cad7449 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -2,23 +2,16 @@ /* * Copyright (c) 2014 Christoph Hellwig. 
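The xfs_ondisk.h additions above lean on compile-time layout assertions, so a v5 structure that stops embedding its v4 header at offset zero breaks the build instead of the disk format. A sketch of the assertion idiom behind those macros (the actual bodies in fs/xfs/xfs_ondisk.h may differ in detail):

#include <linux/build_bug.h>
#include <linux/stddef.h>

#define EXAMPLE_CHECK_STRUCT_SIZE(structname, size) \
	BUILD_BUG_ON(sizeof(structname) != (size))

#define EXAMPLE_CHECK_OFFSET(structname, member, off) \
	BUILD_BUG_ON(offsetof(structname, member) != (off))

/* e.g. the magic must sit at the very start of a dir3 data block: */
/* EXAMPLE_CHECK_OFFSET(struct xfs_dir3_data_hdr, hdr.magic, 0); */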
*/ -#include <linux/iomap.h> #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_log.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" -#include "xfs_error.h" #include "xfs_iomap.h" -#include "xfs_shared.h" -#include "xfs_bit.h" -#include "xfs_pnfs.h" /* * Ensure that we do not have any outstanding pNFS layouts that can be used by @@ -185,7 +178,7 @@ xfs_fs_map_blocks( } xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_bmbt_to_iomap(ip, iomap, &imap); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); *device_generation = mp->m_generation; return error; out_unlock: diff --git a/fs/xfs/xfs_pwork.c b/fs/xfs/xfs_pwork.c new file mode 100644 index 000000000000..4bcc3e61056c --- /dev/null +++ b/fs/xfs/xfs_pwork.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trace.h" +#include "xfs_sysctl.h" +#include "xfs_pwork.h" +#include <linux/nmi.h> + +/* + * Parallel Work Queue + * =================== + * + * Abstract away the details of running a large and "obviously" parallelizable + * task across multiple CPUs. Callers initialize the pwork control object with + * a desired level of parallelization and a work function. Next, they embed + * struct xfs_pwork in whatever structure they use to pass work context to a + * worker thread and queue that pwork. The work function will be passed the + * pwork item when it is run (from process context) and any returned error will + * be recorded in xfs_pwork_ctl.error. Work functions should check for errors + * and abort if necessary; the non-zeroness of xfs_pwork_ctl.error does not + * stop workqueue item processing. + * + * This is the rough equivalent of the xfsprogs workqueue code, though we can't + * reuse that name here. + */ + +/* Invoke our caller's function. */ +static void +xfs_pwork_work( + struct work_struct *work) +{ + struct xfs_pwork *pwork; + struct xfs_pwork_ctl *pctl; + int error; + + pwork = container_of(work, struct xfs_pwork, work); + pctl = pwork->pctl; + error = pctl->work_fn(pctl->mp, pwork); + if (error && !pctl->error) + pctl->error = error; + if (atomic_dec_and_test(&pctl->nr_work)) + wake_up(&pctl->poll_wait); +} + +/* + * Set up control data for parallel work. @work_fn is the function that will + * be called. @tag will be written into the kernel threads. @nr_threads is + * the level of parallelism desired, or 0 for no limit. + */ +int +xfs_pwork_init( + struct xfs_mount *mp, + struct xfs_pwork_ctl *pctl, + xfs_pwork_work_fn work_fn, + const char *tag, + unsigned int nr_threads) +{ +#ifdef DEBUG + if (xfs_globals.pwork_threads >= 0) + nr_threads = xfs_globals.pwork_threads; +#endif + trace_xfs_pwork_init(mp, nr_threads, current->pid); + + pctl->wq = alloc_workqueue("%s-%d", WQ_FREEZABLE, nr_threads, tag, + current->pid); + if (!pctl->wq) + return -ENOMEM; + pctl->work_fn = work_fn; + pctl->error = 0; + pctl->mp = mp; + atomic_set(&pctl->nr_work, 0); + init_waitqueue_head(&pctl->poll_wait); + + return 0; +} + +/* Queue some parallel work. 
*/ +void +xfs_pwork_queue( + struct xfs_pwork_ctl *pctl, + struct xfs_pwork *pwork) +{ + INIT_WORK(&pwork->work, xfs_pwork_work); + pwork->pctl = pctl; + atomic_inc(&pctl->nr_work); + queue_work(pctl->wq, &pwork->work); +} + +/* Wait for the work to finish and tear down the control structure. */ +int +xfs_pwork_destroy( + struct xfs_pwork_ctl *pctl) +{ + destroy_workqueue(pctl->wq); + pctl->wq = NULL; + return pctl->error; +} + +/* + * Wait for the work to finish by polling completion status and touch the soft + * lockup watchdog. This is for callers such as mount which hold locks. + */ +void +xfs_pwork_poll( + struct xfs_pwork_ctl *pctl) +{ + while (wait_event_timeout(pctl->poll_wait, + atomic_read(&pctl->nr_work) == 0, HZ) == 0) + touch_softlockup_watchdog(); +} + +/* + * Return the amount of parallelism that the data device can handle, or 0 for + * no limit. + */ +unsigned int +xfs_pwork_guess_datadev_parallelism( + struct xfs_mount *mp) +{ + struct xfs_buftarg *btp = mp->m_ddev_targp; + + /* + * For now we'll go with the most conservative setting possible, + * which is two threads for an SSD and 1 thread everywhere else. + */ + return blk_queue_nonrot(btp->bt_bdev->bd_queue) ? 2 : 1; +} diff --git a/fs/xfs/xfs_pwork.h b/fs/xfs/xfs_pwork.h new file mode 100644 index 000000000000..8133124cf3bb --- /dev/null +++ b/fs/xfs/xfs_pwork.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_PWORK_H__ +#define __XFS_PWORK_H__ + +struct xfs_pwork; +struct xfs_mount; + +typedef int (*xfs_pwork_work_fn)(struct xfs_mount *mp, struct xfs_pwork *pwork); + +/* + * Parallel work coordination structure. + */ +struct xfs_pwork_ctl { + struct workqueue_struct *wq; + struct xfs_mount *mp; + xfs_pwork_work_fn work_fn; + struct wait_queue_head poll_wait; + atomic_t nr_work; + int error; +}; + +/* + * Embed this parallel work control item inside your own work structure, + * then queue work with it. + */ +struct xfs_pwork { + struct work_struct work; + struct xfs_pwork_ctl *pctl; +}; + +#define XFS_PWORK_SINGLE_THREADED { .pctl = NULL } + +/* Have we been told to abort? */ +static inline bool +xfs_pwork_ctl_want_abort( + struct xfs_pwork_ctl *pctl) +{ + return pctl && pctl->error; +} + +/* Have we been told to abort? */ +static inline bool +xfs_pwork_want_abort( + struct xfs_pwork *pwork) +{ + return xfs_pwork_ctl_want_abort(pwork->pctl); +} + +int xfs_pwork_init(struct xfs_mount *mp, struct xfs_pwork_ctl *pctl, + xfs_pwork_work_fn work_fn, const char *tag, + unsigned int nr_threads); +void xfs_pwork_queue(struct xfs_pwork_ctl *pctl, struct xfs_pwork *pwork); +int xfs_pwork_destroy(struct xfs_pwork_ctl *pctl); +void xfs_pwork_poll(struct xfs_pwork_ctl *pctl); +unsigned int xfs_pwork_guess_datadev_parallelism(struct xfs_mount *mp); + +#endif /* __XFS_PWORK_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 52ed7904df10..5e7a37f0cf84 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -13,19 +13,15 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" +#include "xfs_iwalk.h" #include "xfs_quota.h" -#include "xfs_error.h" #include "xfs_bmap.h" -#include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_cksum.h" /* * The global quota manager. 
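Putting the new pwork pieces together, the intended usage follows the comments in xfs_pwork.c and xfs_pwork.h: embed struct xfs_pwork, initialize a control structure, queue items, then destroy (which waits and returns the first recorded error). A hedged usage sketch; example_work, example_fn, and example_run are hypothetical, while every xfs_pwork_*() call matches the header above:

struct example_work {
	struct xfs_pwork	pwork;	/* must be embedded */
	xfs_agnumber_t		agno;	/* caller's per-item context */
};

static int
example_fn(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct example_work	*ew;

	ew = container_of(pwork, struct example_work, pwork);
	if (xfs_pwork_want_abort(pwork))
		return 0;	/* another worker already recorded an error */
	/* ... do the per-AG work on ew->agno ... */
	return 0;
}

static int
example_run(
	struct xfs_mount	*mp,
	struct example_work	*items,
	unsigned int		nr)
{
	struct xfs_pwork_ctl	pctl;
	unsigned int		i;
	int			error;

	error = xfs_pwork_init(mp, &pctl, example_fn, "example",
			xfs_pwork_guess_datadev_parallelism(mp));
	if (error)
		return error;
	for (i = 0; i < nr; i++)
		xfs_pwork_queue(&pctl, &items[i].pwork);
	return xfs_pwork_destroy(&pctl);	/* waits, then reports pctl.error */
}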
There is only one of these for the entire @@ -1118,17 +1114,15 @@ xfs_qm_quotacheck_dqadjust( /* ARGSUSED */ STATIC int xfs_qm_dqusage_adjust( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* not used */ - int ubsize, /* not used */ - int *ubused, /* not used */ - int *res) /* result code value */ + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_ino_t ino, + void *data) { - xfs_inode_t *ip; - xfs_qcnt_t nblks; - xfs_filblks_t rtblks = 0; /* total rt blks */ - int error; + struct xfs_inode *ip; + xfs_qcnt_t nblks; + xfs_filblks_t rtblks = 0; /* total rt blks */ + int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -1136,20 +1130,18 @@ xfs_qm_dqusage_adjust( * rootino must have its resources accounted for, not so with the quota * inodes. */ - if (xfs_is_quota_inode(&mp->m_sb, ino)) { - *res = BULKSTAT_RV_NOTHING; - return -EINVAL; - } + if (xfs_is_quota_inode(&mp->m_sb, ino)) + return 0; /* * We don't _need_ to take the ilock EXCL here because quotacheck runs * at mount time and therefore nobody will be racing chown/chproj. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip); - if (error) { - *res = BULKSTAT_RV_NOTHING; + error = xfs_iget(mp, tp, ino, XFS_IGET_DONTCACHE, 0, &ip); + if (error == -EINVAL || error == -ENOENT) + return 0; + if (error) return error; - } ASSERT(ip->i_delayed_blks == 0); @@ -1157,7 +1149,7 @@ xfs_qm_dqusage_adjust( struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) goto error0; } @@ -1200,13 +1192,8 @@ xfs_qm_dqusage_adjust( goto error0; } - xfs_irele(ip); - *res = BULKSTAT_RV_DIDONE; - return 0; - error0: xfs_irele(ip); - *res = BULKSTAT_RV_GIVEUP; return error; } @@ -1270,18 +1257,13 @@ STATIC int xfs_qm_quotacheck( xfs_mount_t *mp) { - int done, count, error, error2; - xfs_ino_t lastino; - size_t structsz; + int error, error2; uint flags; LIST_HEAD (buffer_list); struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; - count = INT_MAX; - structsz = 1; - lastino = 0; flags = 0; ASSERT(uip || gip || pip); @@ -1318,18 +1300,10 @@ xfs_qm_quotacheck( flags |= XFS_PQUOTA_CHKD; } - do { - /* - * Iterate thru all the inodes in the file system, - * adjusting the corresponding dquot counters in core. - */ - error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_dqusage_adjust, - structsz, NULL, &done); - if (error) - break; - - } while (!done); + error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, + NULL); + if (error) + goto error_return; /* * We've made all the changes that we need to make incore. 
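Quotacheck's conversion from bulkstat to the inode walker gives the callback a much simpler contract, visible in the new xfs_qm_dqusage_adjust() signature above: the walker hands each allocated inode number to the callback, a return of 0 continues the walk, and a negative errno aborts it. A hedged sketch; example_count_inode and the atomic counter are hypothetical, and with xfs_iwalk_threaded() callbacks can run concurrently, so shared state must be self-serializing (quotacheck relies on dquot locking for this):

static int
example_count_inode(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	atomic64_t		*count = data;

	atomic64_inc(count);
	return 0;
}

/* walk every allocated inode from inode 0, mirroring the call above: */
/* error = xfs_iwalk_threaded(mp, 0, 0, example_count_inode, 0, true, &count); */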
Flush them @@ -1812,7 +1786,8 @@ xfs_qm_vop_chown_reserve( uint flags) { struct xfs_mount *mp = ip->i_mount; - uint delblks, blkflags, prjflags = 0; + uint64_t delblks; + unsigned int blkflags, prjflags = 0; struct xfs_dquot *udq_unres = NULL; struct xfs_dquot *gdq_unres = NULL; struct xfs_dquot *pdq_unres = NULL; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 3ccf0fbc9071..b41b75089548 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -113,12 +113,8 @@ xfs_quota_inode(xfs_mount_t *mp, uint dq_flags) return NULL; } -extern void xfs_trans_mod_dquot(struct xfs_trans *, - struct xfs_dquot *, uint, long); -extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, - struct xfs_mount *, struct xfs_dquot *, - struct xfs_dquot *, struct xfs_dquot *, - long, long, uint); +extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, + uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 3091e4bc04ef..5d72e88598b4 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -5,13 +5,13 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_error.h" #include "xfs_trans.h" #include "xfs_qm.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index b3190890f096..da7ad0383037 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -4,7 +4,6 @@ * All Rights Reserved. */ -#include <linux/capability.h> #include "xfs.h" #include "xfs_fs.h" @@ -12,17 +11,13 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_error.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_defer.h" STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 55b798265ef7..efe42ae7a2f3 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -56,32 +56,35 @@ xfs_quota_chkd_flag( * The structure kept inside the xfs_trans_t keep track of dquot changes * within a transaction and apply them later. 
*/ -typedef struct xfs_dqtrx { +struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ - ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_ino_res; /* inode reserved on a dquot */ - ulong qt_ino_res_used; /* inodes used from the reservation */ - long qt_bcount_delta; /* dquot blk count changes */ - long qt_delbcnt_delta; /* delayed dquot blk count changes */ - long qt_icount_delta; /* dquot inode count changes */ - ulong qt_rtblk_res; /* # blks reserved on a dquot */ - ulong qt_rtblk_res_used;/* # blks used from reservation */ - long qt_rtbcount_delta;/* dquot realtime blk changes */ - long qt_delrtb_delta; /* delayed RT blk count changes */ -} xfs_dqtrx_t; + + uint64_t qt_blk_res; /* blks reserved on a dquot */ + int64_t qt_bcount_delta; /* dquot blk count changes */ + int64_t qt_delbcnt_delta; /* delayed dquot blk count changes */ + + uint64_t qt_rtblk_res; /* # blks reserved on a dquot */ + uint64_t qt_rtblk_res_used;/* # blks used from reservation */ + int64_t qt_rtbcount_delta;/* dquot realtime blk changes */ + int64_t qt_delrtb_delta; /* delayed RT blk count changes */ + + uint64_t qt_ino_res; /* inode reserved on a dquot */ + uint64_t qt_ino_res_used; /* inodes used from the reservation */ + int64_t qt_icount_delta; /* dquot inode count changes */ +}; #ifdef CONFIG_XFS_QUOTA extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); extern void xfs_trans_free_dqinfo(struct xfs_trans *); extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, - uint, long); + uint, int64_t); extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, - struct xfs_inode *, long, long, uint); + struct xfs_inode *, int64_t, long, uint); extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, struct xfs_mount *, struct xfs_dquot *, - struct xfs_dquot *, struct xfs_dquot *, long, long, uint); + struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint); extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, @@ -121,14 +124,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, - struct xfs_inode *ip, long nblks, long ninos, uint flags) + struct xfs_inode *ip, int64_t nblks, long ninos, uint flags) { return 0; } static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, struct xfs_mount *mp, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - long nblks, long nions, uint flags) + int64_t nblks, long nions, uint flags) { return 0; } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index a7c0c657dfaf..cd6c7210a373 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -4,6 +4,7 @@ * All Rights Reserved. 
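The xfs_dqtrx rewrite above is a type-width fix as much as a cleanup: "long" and "ulong" are 32 bits on 32-bit kernels, so deltas and reservations past 2^31 blocks would truncate, while fixed-width int64_t/uint64_t fields behave identically on every architecture. A tiny standalone illustration of the hazard; the helper is hypothetical:

static inline bool
example_delta_fits_in_long(
	int64_t			delta)
{
	/* true on 64-bit kernels; false on i386 once |delta| exceeds 2^31 */
	return delta == (int64_t)(long)delta;
}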
*/ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -11,10 +12,8 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_trans.h" -#include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_qm.h" -#include <linux/quota.h> static void diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fce38b56b962..d8288aa0670a 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -14,7 +14,6 @@ #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_buf_item.h" #include "xfs_refcount_item.h" #include "xfs_log.h" #include "xfs_refcount.h" @@ -95,15 +94,6 @@ xfs_cui_item_format( } /* - * Pinning has no meaning for an cui item, so just return. - */ -STATIC void -xfs_cui_item_pin( - struct xfs_log_item *lip) -{ -} - -/* * The unpin operation is the last place an CUI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In * either case, the CUI transaction has been successfully committed to make it @@ -122,71 +112,22 @@ xfs_cui_item_unpin( } /* - * CUI items have no locking or pushing. However, since CUIs are pulled from - * the AIL when their corresponding CUDs are committed to disk, their situation - * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller - * will eventually flush the log. This should help in getting the CUI out of - * the AIL. - */ -STATIC uint -xfs_cui_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_PINNED; -} - -/* * The CUI has been either committed or aborted if the transaction has been * cancelled. If the transaction was cancelled, an CUD isn't going to be * constructed and thus we free the CUI here directly. */ STATIC void -xfs_cui_item_unlock( +xfs_cui_item_release( struct xfs_log_item *lip) { - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) - xfs_cui_release(CUI_ITEM(lip)); + xfs_cui_release(CUI_ITEM(lip)); } -/* - * The CUI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. - */ -STATIC xfs_lsn_t -xfs_cui_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - return lsn; -} - -/* - * The CUI dependency tracking op doesn't do squat. It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ -STATIC void -xfs_cui_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ -} - -/* - * This is the ops vector shared by all cui log items. - */ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, - .iop_pin = xfs_cui_item_pin, .iop_unpin = xfs_cui_item_unpin, - .iop_unlock = xfs_cui_item_unlock, - .iop_committed = xfs_cui_item_committed, - .iop_push = xfs_cui_item_push, - .iop_committing = xfs_cui_item_committing, + .iop_release = xfs_cui_item_release, }; /* @@ -254,126 +195,250 @@ xfs_cud_item_format( } /* - * Pinning has no meaning for an cud item, so just return. + * The CUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the CUI and free the + * CUD. 
*/ STATIC void -xfs_cud_item_pin( +xfs_cud_item_release( struct xfs_log_item *lip) { + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); } -/* - * Since pinning has no meaning for an cud item, unpinning does - * not either. - */ -STATIC void -xfs_cud_item_unpin( - struct xfs_log_item *lip, - int remove) +static const struct xfs_item_ops xfs_cud_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .iop_size = xfs_cud_item_size, + .iop_format = xfs_cud_item_format, + .iop_release = xfs_cud_item_release, +}; + +static struct xfs_cud_log_item * +xfs_trans_get_cud( + struct xfs_trans *tp, + struct xfs_cui_log_item *cuip) { + struct xfs_cud_log_item *cudp; + + cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + xfs_trans_add_item(tp, &cudp->cud_item); + return cudp; } /* - * There isn't much you can do to push on an cud item. It is simply stuck - * waiting for the log to be flushed to disk. + * Finish a refcount update and log it to the CUD. Note that the + * transaction is marked dirty regardless of whether the refcount + * update succeeds or fails to support the CUI/CUD lifecycle rules. */ -STATIC uint -xfs_cud_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) +static int +xfs_trans_log_finish_refcount_update( + struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) { - return XFS_ITEM_PINNED; + int error; + + error = xfs_refcount_finish_one(tp, type, startblock, + blockcount, new_fsb, new_len, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the CUI and frees the CUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + return error; } -/* - * The CUD is either committed or aborted if the transaction is cancelled. If - * the transaction is cancelled, drop our reference to the CUI and free the - * CUD. - */ -STATIC void -xfs_cud_item_unlock( - struct xfs_log_item *lip) +/* Sort refcount intents by AG. */ +static int +xfs_refcount_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) { - struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + struct xfs_mount *mp = priv; + struct xfs_refcount_intent *ra; + struct xfs_refcount_intent *rb; + + ra = container_of(a, struct xfs_refcount_intent, ri_list); + rb = container_of(b, struct xfs_refcount_intent, ri_list); + return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - + XFS_FSB_TO_AGNO(mp, rb->ri_startblock); +} - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { - xfs_cui_release(cudp->cud_cuip); - kmem_zone_free(xfs_cud_zone, cudp); +/* Get a CUI. */ +STATIC void * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_cui_log_item *cuip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + cuip = xfs_cui_init(tp->t_mountp, count); + ASSERT(cuip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &cuip->cui_item); + return cuip; +} + +/* Set the phys extent flags for this refcount update. 
*/ +static void +xfs_trans_set_refcount_flags( + struct xfs_phys_extent *refc, + enum xfs_refcount_intent_type type) +{ + refc->pe_flags = 0; + switch (type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + refc->pe_flags |= type; + break; + default: + ASSERT(0); } } -/* - * When the cud item is committed to disk, all we need to do is delete our - * reference to our partner cui item and then free ourselves. Since we're - * freeing ourselves we must return -1 to keep the transaction code from - * further referencing this item. - */ -STATIC xfs_lsn_t -xfs_cud_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +/* Log refcount updates in the intent item. */ +STATIC void +xfs_refcount_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) { - struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + struct xfs_cui_log_item *cuip = intent; + struct xfs_refcount_intent *refc; + uint next_extent; + struct xfs_phys_extent *ext; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); /* - * Drop the CUI reference regardless of whether the CUD has been - * aborted. Once the CUD transaction is constructed, it is the sole - * responsibility of the CUD to release the CUI (even if the CUI is - * aborted due to log I/O error). + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. */ - xfs_cui_release(cudp->cud_cuip); - kmem_zone_free(xfs_cud_zone, cudp); + next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; + ASSERT(next_extent < cuip->cui_format.cui_nextents); + ext = &cuip->cui_format.cui_extents[next_extent]; + ext->pe_startblock = refc->ri_startblock; + ext->pe_len = refc->ri_blockcount; + xfs_trans_set_refcount_flags(ext, refc->ri_type); +} - return (xfs_lsn_t)-1; +/* Get an CUD so we can process all the deferred refcount updates. */ +STATIC void * +xfs_refcount_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_cud(tp, intent); } -/* - * The CUD dependency tracking op doesn't do squat. It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ -STATIC void -xfs_cud_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +/* Process a deferred refcount update. */ +STATIC int +xfs_refcount_update_finish_item( + struct xfs_trans *tp, + struct list_head *item, + void *done_item, + void **state) { + struct xfs_refcount_intent *refc; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_aglen; + int error; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + error = xfs_trans_log_finish_refcount_update(tp, done_item, + refc->ri_type, + refc->ri_startblock, + refc->ri_blockcount, + &new_fsb, &new_aglen, + (struct xfs_btree_cur **)state); + /* Did we run out of reservation? Requeue what we didn't finish. */ + if (!error && new_aglen > 0) { + ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || + refc->ri_type == XFS_REFCOUNT_DECREASE); + refc->ri_startblock = new_fsb; + refc->ri_blockcount = new_aglen; + return -EAGAIN; + } + kmem_free(refc); + return error; } -/* - * This is the ops vector shared by all cud log items. 
- */ -static const struct xfs_item_ops xfs_cud_item_ops = { - .iop_size = xfs_cud_item_size, - .iop_format = xfs_cud_item_format, - .iop_pin = xfs_cud_item_pin, - .iop_unpin = xfs_cud_item_unpin, - .iop_unlock = xfs_cud_item_unlock, - .iop_committed = xfs_cud_item_committed, - .iop_push = xfs_cud_item_push, - .iop_committing = xfs_cud_item_committing, -}; +/* Clean up after processing deferred refcounts. */ +STATIC void +xfs_refcount_update_finish_cleanup( + struct xfs_trans *tp, + void *state, + int error) +{ + struct xfs_btree_cur *rcur = state; -/* - * Allocate and initialize an cud item with the given number of extents. - */ -struct xfs_cud_log_item * -xfs_cud_init( - struct xfs_mount *mp, - struct xfs_cui_log_item *cuip) + xfs_refcount_finish_one_cleanup(tp, rcur, error); +} +/* Abort all pending CUIs. */ +STATIC void +xfs_refcount_update_abort_intent( + void *intent) { - struct xfs_cud_log_item *cudp; + xfs_cui_release(intent); +} - cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); - xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; +/* Cancel a deferred refcount update. */ +STATIC void +xfs_refcount_update_cancel_item( + struct list_head *item) +{ + struct xfs_refcount_intent *refc; - return cudp; + refc = container_of(item, struct xfs_refcount_intent, ri_list); + kmem_free(refc); } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .diff_items = xfs_refcount_update_diff_items, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .log_item = xfs_refcount_update_log_item, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_update_finish_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, +}; + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. 
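With the ops vector above, the deferred-ops machinery can drive refcount updates generically. A rough, hedged driver showing the call ordering the xfs_defer_op_type hooks imply; this illustrates the lifecycle only, not the actual xfs_defer.c loop, which also commits and rolls the transaction between the intent and done items:

static int
example_defer_finish(
	struct xfs_trans	*tp,
	const struct xfs_defer_op_type *ops,
	struct list_head	*items,
	unsigned int		count)
{
	struct list_head	*li, *n;
	void			*intent, *done;
	void			*state = NULL;
	int			error = 0;

	intent = ops->create_intent(tp, count);		/* logs the CUI */
	list_for_each(li, items)
		ops->log_item(tp, intent, li);

	/* ... the real code rolls the transaction here ... */

	done = ops->create_done(tp, intent, count);	/* logs the CUD */
	list_for_each_safe(li, n, items) {
		list_del(li);		/* finish_item() may free the entry */
		error = ops->finish_item(tp, li, done, &state);
		if (error)		/* including -EAGAIN for requeue */
			break;
	}
	if (ops->finish_cleanup)
		ops->finish_cleanup(tp, state, error);
	return error;
}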
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index 3896dcc2368f..e47530f30489 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -78,8 +78,6 @@ extern struct kmem_zone *xfs_cui_zone; extern struct kmem_zone *xfs_cud_zone; struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); -struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, - struct xfs_cui_log_item *); void xfs_cui_item_free(struct xfs_cui_log_item *); void xfs_cui_release(struct xfs_cui_log_item *); int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c5b4fa004ca4..c4ec7afd1170 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -11,21 +11,12 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" -#include "xfs_error.h" -#include "xfs_dir2.h" -#include "xfs_dir2_priv.h" -#include "xfs_ioctl.h" #include "xfs_trace.h" -#include "xfs_log.h" #include "xfs_icache.h" -#include "xfs_pnfs.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" #include "xfs_refcount.h" @@ -33,11 +24,9 @@ #include "xfs_trans_space.h" #include "xfs_bit.h" #include "xfs_alloc.h" -#include "xfs_quota_defs.h" #include "xfs_quota.h" #include "xfs_reflink.h" #include "xfs_iomap.h" -#include "xfs_rmap_btree.h" #include "xfs_sb.h" #include "xfs_ag_resv.h" @@ -192,7 +181,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { + if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -234,93 +223,59 @@ xfs_reflink_trim_around_shared( } } -/* - * Trim the passed in imap to the next shared/unshared extent boundary, and - * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. - * - * Note that imap will always contain the block numbers for the existing blocks - * in the data fork, as the upper layers need them for read-modify-write - * operations. - */ -int -xfs_reflink_reserve_cow( +bool +xfs_inode_need_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap) + struct xfs_bmbt_irec *imap, + bool *shared) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got; - int error = 0; - bool eof = false; - struct xfs_iext_cursor icur; - bool shared; - - /* - * Search the COW fork extent list first. This serves two purposes: - * first this implement the speculative preallocation using cowextisze, - * so that we also unshared block adjacent to shared blocks instead - * of just the shared blocks themselves. Second the lookup in the - * extent list is generally faster than going out to the shared extent - * tree. - */ - - if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &icur, &got)) - eof = true; - if (!eof && got.br_startoff <= imap->br_startoff) { - trace_xfs_reflink_cow_found(ip, imap); - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + /* We can't update any real extents in always COW mode. */ + if (xfs_is_always_cow_inode(ip) && + !isnullstartblock(imap->br_startblock)) { + *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. 
*/ - error = xfs_reflink_trim_around_shared(ip, imap, &shared); - if (error) - return error; - - /* Not shared? Just report the (potentially capped) extent. */ - if (!shared) - return 0; - - /* - * Fork all the shared blocks from our write offset until the end of - * the extent. - */ - error = xfs_qm_dqattach_locked(ip, false); - if (error) - return error; - - error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &icur, eof); - if (error == -ENOSPC || error == -EDQUOT) - trace_xfs_reflink_cow_enospc(ip, imap); - if (error) - return error; - - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - trace_xfs_reflink_cow_alloc(ip, &got); - return 0; + return xfs_reflink_trim_around_shared(ip, imap, shared); } -/* Convert part of an unwritten CoW extent to a real one. */ -STATIC int -xfs_reflink_convert_cow_extent( - struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - xfs_fileoff_t offset_fsb, - xfs_filblks_t count_fsb) +static int +xfs_reflink_convert_cow_locked( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb) { - int nimaps = 1; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_btree_cur *dummy_cur = NULL; + int dummy_logflags; + int error = 0; - if (imap->br_state == XFS_EXT_NORM) + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) return 0; - xfs_trim_extent(imap, offset_fsb, count_fsb); - trace_xfs_reflink_convert_cow(ip, imap); - if (imap->br_blockcount == 0) - return 0; - return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, 0, imap, - &nimaps); + do { + if (got.br_startoff >= offset_fsb + count_fsb) + break; + if (got.br_state == XFS_EXT_NORM) + continue; + if (WARN_ON_ONCE(isnullstartblock(got.br_startblock))) + return -EIO; + + xfs_trim_extent(&got, offset_fsb, count_fsb); + if (!got.br_blockcount) + continue; + + got.br_state = XFS_EXT_NORM; + error = xfs_bmap_add_extent_unwritten_real(NULL, ip, + XFS_COW_FORK, &icur, &dummy_cur, &got, + &dummy_logflags); + if (error) + return error; + } while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got)); + + return error; } /* Convert all of the unwritten CoW extents in a file's range to real ones. 
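xfs_reflink_convert_cow_locked() above walks the COW fork with the incore extent cursor rather than issuing a bmapi call. The walk idiom itself is short; a hedged sketch that counts COW-fork extents overlapping a range (the function name is hypothetical, the cursor calls match the code above):

static int
example_count_cow_extents(
	struct xfs_inode	*ip,
	xfs_fileoff_t		off,
	xfs_filblks_t		len)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	int			nr = 0;

	/* caller is assumed to hold the ILOCK */
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, off, &icur, &got))
		return 0;
	do {
		if (got.br_startoff >= off + len)
			break;
		nr++;
	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));
	return nr;
}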
*/ @@ -334,15 +289,12 @@ xfs_reflink_convert_cow( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_filblks_t count_fsb = end_fsb - offset_fsb; - struct xfs_bmbt_irec imap; - int nimaps = 1, error = 0; + int error; ASSERT(count != 0); xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_write(NULL, ip, offset_fsb, count_fsb, - XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT | - XFS_BMAPI_CONVERT_ONLY, 0, &imap, &nimaps); + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -375,7 +327,7 @@ xfs_find_trim_cow_extent( if (got.br_startoff > offset_fsb) { xfs_trim_extent(imap, imap->br_startoff, got.br_startoff - imap->br_startoff); - return xfs_reflink_trim_around_shared(ip, imap, shared); + return xfs_inode_need_cow(ip, imap, shared); } *shared = true; @@ -397,7 +349,8 @@ xfs_reflink_allocate_cow( struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, - uint *lockmode) + uint *lockmode, + bool convert_now) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; @@ -409,7 +362,10 @@ xfs_reflink_allocate_cow( xfs_extlen_t resblks = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_is_reflink_inode(ip)); + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } error = xfs_find_trim_cow_extent(ip, imap, shared, &found); if (error || !*shared) @@ -471,7 +427,16 @@ xfs_reflink_allocate_cow( if (nimaps == 0) return -ENOSPC; convert: - return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); + xfs_trim_extent(imap, offset_fsb, count_fsb); + /* + * COW fork extents are supposed to remain unwritten until we're ready + * to initiate a disk write. For direct I/O we are going to write the + * data and need the conversion, but for buffered writes we're done. + */ + if (!convert_now || imap->br_state == XFS_EXT_NORM) + return 0; + trace_xfs_reflink_convert_cow(ip, imap); + return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb); out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, @@ -586,7 +551,7 @@ xfs_reflink_cancel_cow_range( int error; trace_xfs_reflink_cancel_cow_range(ip, offset, count); - ASSERT(xfs_is_reflink_inode(ip)); + ASSERT(ip->i_cowfp); offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); if (count == NULLFILEOFF) @@ -596,7 +561,7 @@ xfs_reflink_cancel_cow_range( /* Start a rolling transaction to remove the mappings */ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - 0, 0, XFS_TRANS_NOFS, &tp); + 0, 0, 0, &tp); if (error) goto out; @@ -655,7 +620,7 @@ xfs_reflink_end_cow_extent( resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + XFS_TRANS_RESERVE, &tp); if (error) return error; @@ -1192,7 +1157,7 @@ xfs_reflink_remap_blocks( break; ASSERT(nimaps == 1); - trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK, &imap); /* Translate imap into the destination file. 
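The new convert_now argument encodes the buffered-vs-direct split described in the comment above: direct I/O is about to write the blocks and wants them converted to written immediately, while buffered I/O leaves them unwritten until writeback. A hedged call-site sketch; the wrapper and its locals are hypothetical, and per the ASSERT above the caller must already hold the ILOCK:

static int
example_get_cow_blocks(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	bool			direct_io)
{
	bool			shared = false;
	uint			lockmode = XFS_ILOCK_EXCL;

	return xfs_reflink_allocate_cow(ip, imap, &shared, &lockmode,
			direct_io /* convert_now */);
}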
*/ diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 6d73daef1f13..28a43b7f581d 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -6,16 +6,28 @@ #ifndef __XFS_REFLINK_H #define __XFS_REFLINK_H 1 +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) +{ + return ip->i_mount->m_always_cow && + xfs_sb_version_hasreflink(&ip->i_mount->m_sb); +} + +static inline bool xfs_is_cow_inode(struct xfs_inode *ip) +{ + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); +} + extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, + bool *shared); -extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); + struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode, + bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 127dc9c32a54..77ed557b6127 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -14,7 +14,6 @@ #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_buf_item.h" #include "xfs_rmap_item.h" #include "xfs_log.h" #include "xfs_rmap.h" @@ -94,15 +93,6 @@ xfs_rui_item_format( } /* - * Pinning has no meaning for an rui item, so just return. - */ -STATIC void -xfs_rui_item_pin( - struct xfs_log_item *lip) -{ -} - -/* * The unpin operation is the last place an RUI is manipulated in the log. It is * either inserted in the AIL or aborted in the event of a log I/O error. In * either case, the RUI transaction has been successfully committed to make it @@ -121,71 +111,22 @@ xfs_rui_item_unpin( } /* - * RUI items have no locking or pushing. However, since RUIs are pulled from - * the AIL when their corresponding RUDs are committed to disk, their situation - * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller - * will eventually flush the log. This should help in getting the RUI out of - * the AIL. - */ -STATIC uint -xfs_rui_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_PINNED; -} - -/* * The RUI has been either committed or aborted if the transaction has been * cancelled. If the transaction was cancelled, an RUD isn't going to be * constructed and thus we free the RUI here directly. */ STATIC void -xfs_rui_item_unlock( +xfs_rui_item_release( struct xfs_log_item *lip) { - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) - xfs_rui_release(RUI_ITEM(lip)); + xfs_rui_release(RUI_ITEM(lip)); } -/* - * The RUI is logged only once and cannot be moved in the log, so simply return - * the lsn at which it's been logged. - */ -STATIC xfs_lsn_t -xfs_rui_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - return lsn; -} - -/* - * The RUI dependency tracking op doesn't do squat. It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. 
For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ -STATIC void -xfs_rui_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ -} - -/* - * This is the ops vector shared by all rui log items. - */ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, - .iop_pin = xfs_rui_item_pin, .iop_unpin = xfs_rui_item_unpin, - .iop_unlock = xfs_rui_item_unlock, - .iop_committed = xfs_rui_item_committed, - .iop_push = xfs_rui_item_push, - .iop_committing = xfs_rui_item_committing, + .iop_release = xfs_rui_item_release, }; /* @@ -275,126 +216,271 @@ xfs_rud_item_format( } /* - * Pinning has no meaning for an rud item, so just return. + * The RUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the RUI and free the + * RUD. */ STATIC void -xfs_rud_item_pin( +xfs_rud_item_release( struct xfs_log_item *lip) { + struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + + xfs_rui_release(rudp->rud_ruip); + kmem_zone_free(xfs_rud_zone, rudp); } -/* - * Since pinning has no meaning for an rud item, unpinning does - * not either. - */ -STATIC void -xfs_rud_item_unpin( - struct xfs_log_item *lip, - int remove) +static const struct xfs_item_ops xfs_rud_item_ops = { + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, + .iop_size = xfs_rud_item_size, + .iop_format = xfs_rud_item_format, + .iop_release = xfs_rud_item_release, +}; + +static struct xfs_rud_log_item * +xfs_trans_get_rud( + struct xfs_trans *tp, + struct xfs_rui_log_item *ruip) { + struct xfs_rud_log_item *rudp; + + rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + xfs_trans_add_item(tp, &rudp->rud_item); + return rudp; } -/* - * There isn't much you can do to push on an rud item. It is simply stuck - * waiting for the log to be flushed to disk. - */ -STATIC uint -xfs_rud_item_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) +/* Set the map extent flags for this reverse mapping. */ +static void +xfs_trans_set_rmap_flags( + struct xfs_map_extent *rmap, + enum xfs_rmap_intent_type type, + int whichfork, + xfs_exntst_t state) { - return XFS_ITEM_PINNED; + rmap->me_flags = 0; + if (state == XFS_EXT_UNWRITTEN) + rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) + rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; + switch (type) { + case XFS_RMAP_MAP: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP; + break; + case XFS_RMAP_MAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; + case XFS_RMAP_UNMAP: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; + break; + case XFS_RMAP_UNMAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; + case XFS_RMAP_CONVERT: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; + break; + case XFS_RMAP_CONVERT_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; + case XFS_RMAP_ALLOC: + rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; + break; + case XFS_RMAP_FREE: + rmap->me_flags |= XFS_RMAP_EXTENT_FREE; + break; + default: + ASSERT(0); + } } /* - * The RUD is either committed or aborted if the transaction is cancelled. If - * the transaction is cancelled, drop our reference to the RUI and free the - * RUD. + * Finish an rmap update and log it to the RUD. 
Note that the transaction is + * marked dirty regardless of whether the rmap update succeeds or fails to + * support the RUI/RUD lifecycle rules. */ -STATIC void -xfs_rud_item_unlock( - struct xfs_log_item *lip) +static int +xfs_trans_log_finish_rmap_update( + struct xfs_trans *tp, + struct xfs_rud_log_item *rudp, + enum xfs_rmap_intent_type type, + uint64_t owner, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state, + struct xfs_btree_cur **pcur) { - struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + int error; - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { - xfs_rui_release(rudp->rud_ruip); - kmem_zone_free(xfs_rud_zone, rudp); - } + error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, + startblock, blockcount, state, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the RUI and frees the RUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + return error; } -/* - * When the rud item is committed to disk, all we need to do is delete our - * reference to our partner rui item and then free ourselves. Since we're - * freeing ourselves we must return -1 to keep the transaction code from - * further referencing this item. - */ -STATIC xfs_lsn_t -xfs_rud_item_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +/* Sort rmap intents by AG. */ +static int +xfs_rmap_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) { - struct xfs_rud_log_item *rudp = RUD_ITEM(lip); + struct xfs_mount *mp = priv; + struct xfs_rmap_intent *ra; + struct xfs_rmap_intent *rb; + + ra = container_of(a, struct xfs_rmap_intent, ri_list); + rb = container_of(b, struct xfs_rmap_intent, ri_list); + return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - + XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); +} + +/* Get an RUI. */ +STATIC void * +xfs_rmap_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_rui_log_item *ruip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + ruip = xfs_rui_init(tp->t_mountp, count); + ASSERT(ruip != NULL); /* - * Drop the RUI reference regardless of whether the RUD has been - * aborted. Once the RUD transaction is constructed, it is the sole - * responsibility of the RUD to release the RUI (even if the RUI is - * aborted due to log I/O error). + * Get a log_item_desc to point at the new item. */ - xfs_rui_release(rudp->rud_ruip); - kmem_zone_free(xfs_rud_zone, rudp); - - return (xfs_lsn_t)-1; + xfs_trans_add_item(tp, &ruip->rui_item); + return ruip; } -/* - * The RUD dependency tracking op doesn't do squat. It can't because - * it doesn't know where the free extent is coming from. The dependency - * tracking has to be handled by the "enclosing" metadata object. For - * example, for inodes, the inode is locked throughout the extent freeing - * so the dependency should be recorded there. - */ +/* Log rmap updates in the intent item. 
*/ STATIC void -xfs_rud_item_committing( - struct xfs_log_item *lip, - xfs_lsn_t lsn) +xfs_rmap_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) { + struct xfs_rui_log_item *ruip = intent; + struct xfs_rmap_intent *rmap; + uint next_extent; + struct xfs_map_extent *map; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; + ASSERT(next_extent < ruip->rui_format.rui_nextents); + map = &ruip->rui_format.rui_extents[next_extent]; + map->me_owner = rmap->ri_owner; + map->me_startblock = rmap->ri_bmap.br_startblock; + map->me_startoff = rmap->ri_bmap.br_startoff; + map->me_len = rmap->ri_bmap.br_blockcount; + xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, + rmap->ri_bmap.br_state); } -/* - * This is the ops vector shared by all rud log items. - */ -static const struct xfs_item_ops xfs_rud_item_ops = { - .iop_size = xfs_rud_item_size, - .iop_format = xfs_rud_item_format, - .iop_pin = xfs_rud_item_pin, - .iop_unpin = xfs_rud_item_unpin, - .iop_unlock = xfs_rud_item_unlock, - .iop_committed = xfs_rud_item_committed, - .iop_push = xfs_rud_item_push, - .iop_committing = xfs_rud_item_committing, -}; +/* Get an RUD so we can process all the deferred rmap updates. */ +STATIC void * +xfs_rmap_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_rud(tp, intent); +} -/* - * Allocate and initialize an rud item with the given number of extents. - */ -struct xfs_rud_log_item * -xfs_rud_init( - struct xfs_mount *mp, - struct xfs_rui_log_item *ruip) +/* Process a deferred rmap update. */ +STATIC int +xfs_rmap_update_finish_item( + struct xfs_trans *tp, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_rmap_intent *rmap; + int error; + + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + error = xfs_trans_log_finish_rmap_update(tp, done_item, + rmap->ri_type, + rmap->ri_owner, rmap->ri_whichfork, + rmap->ri_bmap.br_startoff, + rmap->ri_bmap.br_startblock, + rmap->ri_bmap.br_blockcount, + rmap->ri_bmap.br_state, + (struct xfs_btree_cur **)state); + kmem_free(rmap); + return error; +} + +/* Clean up after processing deferred rmaps. */ +STATIC void +xfs_rmap_update_finish_cleanup( + struct xfs_trans *tp, + void *state, + int error) +{ + struct xfs_btree_cur *rcur = state; + + xfs_rmap_finish_one_cleanup(tp, rcur, error); +} +/* Abort all pending RUIs. */ +STATIC void +xfs_rmap_update_abort_intent( + void *intent) { - struct xfs_rud_log_item *rudp; + xfs_rui_release(intent); +} - rudp = kmem_zone_zalloc(xfs_rud_zone, KM_SLEEP); - xfs_log_item_init(mp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; +/* Cancel a deferred rmap update. 
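
xfs_rmap_update_log_item() above claims a slot with atomic_inc_return(), which yields the post-increment value and therefore needs the subtract-one. C11's atomic_fetch_add() returns the pre-increment value instead, so the equivalent slot-claiming sketch needs no adjustment:

#include <assert.h>
#include <stdatomic.h>

#define NEXTENTS 16

struct intent {
	atomic_uint next_extent;
	int extents[NEXTENTS];	/* sized when the intent is created */
};

static unsigned int claim_slot(struct intent *ip)
{
	/* fetch_add returns the old value: already a valid array index */
	unsigned int idx = atomic_fetch_add(&ip->next_extent, 1);

	assert(idx < NEXTENTS);
	return idx;
}

int main(void)
{
	struct intent ip = { .next_extent = 0 };

	ip.extents[claim_slot(&ip)] = 42;  /* unique slot even when racing */
	return 0;
}
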
*/ +STATIC void +xfs_rmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_rmap_intent *rmap; - return rudp; + rmap = container_of(item, struct xfs_rmap_intent, ri_list); + kmem_free(rmap); } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .diff_items = xfs_rmap_update_diff_items, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .log_item = xfs_rmap_update_log_item, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_update_finish_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, +}; + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 7e482baa27f5..8708e4a5aa5c 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -78,8 +78,6 @@ extern struct kmem_zone *xfs_rui_zone; extern struct kmem_zone *xfs_rud_zone; struct xfs_rui_log_item *xfs_rui_init(struct xfs_mount *, uint); -struct xfs_rud_log_item *xfs_rud_init(struct xfs_mount *, - struct xfs_rui_log_item *); int xfs_rui_copy_format(struct xfs_log_iovec *buf, struct xfs_rui_log_format *dst_rui_fmt); void xfs_rui_item_free(struct xfs_rui_log_item *); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ac0fcdad0c4e..5fa4db3c3e32 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -11,17 +11,11 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include "xfs_alloc.h" -#include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" -#include "xfs_trace.h" -#include "xfs_buf.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index cc509743facd..113883c4f202 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -4,7 +4,6 @@ * All Rights Reserved. 
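
The xfs_defer_op_type table above is consumed by the generic deferred-operations driver. A compressed model of that consumer, with every type erased to void * for brevity; the real loop in libxfs/xfs_defer.c additionally sorts items with diff_items, rolls the transaction between steps, and invokes abort_intent and cancel_item on failure:

struct defer_ops {
	void *(*create_intent)(int count);
	void (*log_item)(void *intent, void *item);
	void *(*create_done)(void *intent);
	int (*finish_item)(void *done, void *item);
	void (*cancel_item)(void *item);
};

/* Rough shape of one defer pass: log every pending item into a single
 * intent, commit a done item, then do the actual work piecewise. */
static int defer_finish(const struct defer_ops *ops, void **items, int n)
{
	void *intent = ops->create_intent(n);
	void *done;
	int i, error = 0;

	for (i = 0; i < n; i++)
		ops->log_item(intent, items[i]);  /* records work in the log */
	done = ops->create_done(intent);
	for (i = 0; i < n; i++) {
		error = ops->finish_item(done, items[i]);
		if (error)
			break;	/* the real driver aborts the intent here */
	}
	return error;
}
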
*/ #include "xfs.h" -#include <linux/proc_fs.h> struct xstats xfsstats; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c9097cb0b955..f9450235533c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -11,18 +11,15 @@ #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_alloc.h" -#include "xfs_error.h" #include "xfs_fsops.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_extfree_item.h" #include "xfs_mru_cache.h" @@ -38,18 +35,8 @@ #include "xfs_refcount_item.h" #include "xfs_bmap_item.h" #include "xfs_reflink.h" -#include "xfs_defer.h" -#include <linux/namei.h> -#include <linux/dax.h> -#include <linux/init.h> -#include <linux/slab.h> #include <linux/magic.h> -#include <linux/mount.h> -#include <linux/mempool.h> -#include <linux/writeback.h> -#include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/parser.h> static const struct super_operations xfs_super_operations; @@ -66,7 +53,7 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ enum { Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_biosize, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, - Opt_mtpt, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, + Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, @@ -87,7 +74,6 @@ static const match_table_t tokens = { {Opt_sunit, "sunit=%u"}, /* data volume stripe unit */ {Opt_swidth, "swidth=%u"}, /* data volume stripe width */ {Opt_nouuid, "nouuid"}, /* ignore filesystem UUID */ - {Opt_mtpt, "mtpt"}, /* filesystem mount point */ {Opt_grpid, "grpid"}, /* group-ID from parent directory */ {Opt_nogrpid, "nogrpid"}, /* group-ID from current process */ {Opt_bsdgroups, "bsdgroups"}, /* group-ID from parent directory */ @@ -236,9 +222,6 @@ xfs_parseargs( if (!mp->m_logname) return -ENOMEM; break; - case Opt_mtpt: - xfs_warn(mp, "%s option not allowed on this system", p); - return -EINVAL; case Opt_rtdev: kfree(mp->m_rtname); mp->m_rtname = match_strdup(args); @@ -448,7 +431,7 @@ struct proc_xfs_info { char *str; }; -STATIC int +STATIC void xfs_showargs( struct xfs_mount *mp, struct seq_file *m) @@ -527,9 +510,8 @@ xfs_showargs( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); - - return 0; } + static uint64_t xfs_max_file_offset( unsigned int blockshift) @@ -539,26 +521,18 @@ xfs_max_file_offset( /* Figure out maximum filesize, on Linux this can depend on * the filesystem blocksize (on 32 bit platforms). - * __block_write_begin does this in an [unsigned] long... + * __block_write_begin does this in an [unsigned] long long... * page->index << (PAGE_SHIFT - bbits) * So, for page sized blocks (4K on 32 bit platforms), * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1) * but for smaller blocksizes it is less (bbits = log2 bsize). - * Note1: get_block_t takes a long (implicit cast from above) - * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch - * can optionally convert the [unsigned] long from above into - * an [unsigned] long long. 
*/ #if BITS_PER_LONG == 32 -# if defined(CONFIG_LBDAF) ASSERT(sizeof(sector_t) == 8); pagefactor = PAGE_SIZE; bitshift = BITS_PER_LONG; -# else - pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift); -# endif #endif return (((uint64_t)pagefactor) << bitshift) - 1; @@ -595,7 +569,7 @@ xfs_set_inode_alloc( * Calculate how much should be reserved for inodes to meet * the max inode percentage. Used only for inode32. */ - if (mp->m_maxicount) { + if (M_IGEO(mp)->maxicount) { uint64_t icount; icount = sbp->sb_dblocks * sbp->sb_imax_pct; @@ -838,15 +812,10 @@ xfs_init_mount_workqueues( if (!mp->m_buf_workqueue) goto out; - mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); - if (!mp->m_data_workqueue) - goto out_destroy_buf; - mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_unwritten_workqueue) - goto out_destroy_data_iodone_queue; + goto out_destroy_buf; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); @@ -858,16 +827,10 @@ xfs_init_mount_workqueues( if (!mp->m_reclaim_workqueue) goto out_destroy_cil; - mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, - mp->m_fsname); - if (!mp->m_log_workqueue) - goto out_destroy_reclaim; - mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) - goto out_destroy_log; + goto out_destroy_reclaim; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, mp->m_fsname); @@ -878,16 +841,12 @@ xfs_init_mount_workqueues( out_destroy_eofb: destroy_workqueue(mp->m_eofblocks_workqueue); -out_destroy_log: - destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); out_destroy_cil: destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); -out_destroy_data_iodone_queue: - destroy_workqueue(mp->m_data_workqueue); out_destroy_buf: destroy_workqueue(mp->m_buf_workqueue); out: @@ -900,10 +859,8 @@ xfs_destroy_mount_workqueues( { destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_eofblocks_workqueue); - destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); destroy_workqueue(mp->m_cil_workqueue); - destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); } @@ -1152,10 +1109,10 @@ xfs_fs_statfs( fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); statp->f_files = min(icount + fakeinos, (uint64_t)XFS_MAXINUMBER); - if (mp->m_maxicount) + if (M_IGEO(mp)->maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, - mp->m_maxicount); + M_IGEO(mp)->maxicount); /* If sb_icount overshot maxicount, report actual allocation */ statp->f_files = max_t(typeof(statp->f_files), @@ -1376,7 +1333,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_icache_enable_reclaim(mp); + xfs_start_block_reaping(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1390,7 +1347,7 @@ xfs_fs_remount( * Cancel background eofb scanning so it cannot race with the * final log force+buftarg wait and deadlock the remount. */ - xfs_icache_disable_reclaim(mp); + xfs_stop_block_reaping(mp); /* Get rid of any leftover CoW reservations... 
*/ error = xfs_icache_free_cowblocks(mp, NULL); @@ -1434,7 +1391,7 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); - xfs_icache_disable_reclaim(mp); + xfs_stop_block_reaping(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return xfs_sync_sb(mp, true); @@ -1448,7 +1405,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_icache_enable_reclaim(mp); + xfs_start_block_reaping(mp); return 0; } @@ -1457,7 +1414,8 @@ xfs_fs_show_options( struct seq_file *m, struct dentry *root) { - return xfs_showargs(XFS_M(root->d_sb), m); + xfs_showargs(XFS_M(root->d_sb), m); + return 0; } /* @@ -1546,8 +1504,14 @@ xfs_init_percpu_counters( if (error) goto free_ifree; + error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); + if (error) + goto free_fdblocks; + return 0; +free_fdblocks: + percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1571,6 +1535,9 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); percpu_counter_destroy(&mp->m_fdblocks); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + percpu_counter_sum(&mp->m_delalloc_blks) == 0); + percpu_counter_destroy(&mp->m_delalloc_blks); } static struct xfs_mount * @@ -1594,6 +1561,13 @@ xfs_mount_alloc( INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + /* + * We don't create the finobt per-ag space reservation until after log + * recovery, so we must set this to true so that an ifree transaction + * started during log recovery will not depend on space reservations + * for finobt expansion. + */ + mp->m_finobt_nores = true; return mp; } @@ -1689,6 +1663,8 @@ xfs_fs_fill_super( sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; + sb->s_iflags |= SB_I_CGROUPWB; + set_posix_acl_flag(sb); /* version 5 superblocks support inode version counters. 
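
The m_delalloc_blks hunks above follow a common kernel pattern: the new percpu counter slots into the init chain with goto-style unwind, and teardown asserts the counter drained back to zero so leaked in-flight accounting is caught at unmount. The skeleton, with generic counter names rather than the real mount fields:

#include <linux/percpu_counter.h>

static int counters_init(struct percpu_counter *prev,
			 struct percpu_counter *ctr)
{
	int error = percpu_counter_init(ctr, 0, GFP_KERNEL);

	if (error)
		percpu_counter_destroy(prev);	/* unwind earlier inits */
	return error;
}

static void counters_destroy(struct percpu_counter *ctr, bool shutdown)
{
	/* Unless we shut down, every delta must have been backed out. */
	WARN_ON(!shutdown && percpu_counter_sum(ctr) != 0);
	percpu_counter_destroy(ctr);
}
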
*/ @@ -1729,11 +1705,18 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) { - xfs_alert(mp, + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (mp->m_sb.sb_rblocks) { + xfs_alert(mp, "reflink not compatible with realtime device!"); - error = -EINVAL; - goto out_filestream_unmount; + error = -EINVAL; + goto out_filestream_unmount; + } + + if (xfs_globals.always_cow) { + xfs_info(mp, "using DEBUG-only always_cow mode."); + mp->m_always_cow = true; + } } if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 21cb49a43d7c..763e43d22dee 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -38,6 +38,18 @@ extern void xfs_qm_exit(void); # define XFS_SCRUB_STRING #endif +#ifdef CONFIG_XFS_ONLINE_REPAIR +# define XFS_REPAIR_STRING "repair, " +#else +# define XFS_REPAIR_STRING +#endif + +#ifdef CONFIG_XFS_WARN +# define XFS_WARN_STRING "verbose warnings, " +#else +# define XFS_WARN_STRING +#endif + #ifdef DEBUG # define XFS_DBG_STRING "debug" #else @@ -49,6 +61,8 @@ extern void xfs_qm_exit(void); XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ XFS_SCRUB_STRING \ + XFS_REPAIR_STRING \ + XFS_WARN_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index b2c1177c717f..ed66fd2de327 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -12,23 +12,14 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_mount.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" -#include "xfs_defer.h" #include "xfs_dir2.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" -#include "xfs_bmap_util.h" -#include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" -#include "xfs_symlink.h" #include "xfs_trans.h" -#include "xfs_log.h" /* ----- Kernel only functions below ----- */ int diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 0cc034dfb786..31b3bdbd2eba 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -4,10 +4,7 @@ * All Rights Reserved. 
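
The XFS_*_STRING macros above (including the new REPAIR and WARN siblings) rely on C's adjacent-string-literal concatenation: an enabled option expands to a tagged literal, a disabled one to nothing, and the compiler fuses whatever remains into a single version string. A standalone illustration with made-up feature names:

#include <stdio.h>

#define FEATURE_A_STRING "security attributes, "
#define FEATURE_B_STRING		/* disabled: contributes nothing */
#define DBG_STRING "no debug"		/* must stay last */

#define BUILD_OPTIONS FEATURE_A_STRING FEATURE_B_STRING DBG_STRING

int main(void)
{
	/* Prints: XFS built with: security attributes, no debug */
	printf("XFS built with: %s\n", BUILD_OPTIONS);
	return 0;
}
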
*/ #include "xfs.h" -#include <linux/sysctl.h> -#include <linux/proc_fs.h> #include "xfs_error.h" -#include "xfs_stats.h" static struct ctl_table_header *xfs_table_header; diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 168488130a19..8abf4640f1d5 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -82,9 +82,13 @@ enum { extern xfs_param_t xfs_params; struct xfs_globals { +#ifdef DEBUG + int pwork_threads; /* parallel workqueue threads */ +#endif int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ + bool always_cow; /* use COW fork for all overwrites */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index cd6a994a7250..ddd0bf7a4740 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -10,9 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sysfs.h" -#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_stats.h" #include "xfs_mount.h" struct xfs_sysfs_attr { @@ -183,10 +181,74 @@ mount_delay_show( } XFS_SYSFS_ATTR_RW(mount_delay); +static ssize_t +always_cow_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + ssize_t ret; + + ret = kstrtobool(buf, &xfs_globals.always_cow); + if (ret < 0) + return ret; + return count; +} + +static ssize_t +always_cow_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); +} +XFS_SYSFS_ATTR_RW(always_cow); + +#ifdef DEBUG +/* + * Override how many threads the parallel work queue is allowed to create. + * This has to be a debug-only global (instead of an errortag) because one of + * the main users of parallel workqueues is mount time quotacheck. 
+ */ +STATIC ssize_t +pwork_threads_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < -1 || val > num_possible_cpus()) + return -EINVAL; + + xfs_globals.pwork_threads = val; + + return count; +} + +STATIC ssize_t +pwork_threads_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); +} +XFS_SYSFS_ATTR_RW(pwork_threads); +#endif /* DEBUG */ + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), + ATTR_LIST(always_cow), +#ifdef DEBUG + ATTR_LIST(pwork_threads), +#endif NULL, }; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index cb6489c22cad..bc85b89f88ca 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -15,24 +15,16 @@ #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" #include "xfs_alloc.h" #include "xfs_bmap.h" #include "xfs_attr.h" -#include "xfs_attr_leaf.h" #include "xfs_trans.h" -#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" -#include "xfs_iomap.h" -#include "xfs_aops.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_log_recover.h" -#include "xfs_inode_item.h" -#include "xfs_bmap_btree.h" #include "xfs_filestream.h" #include "xfs_fsmap.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 6fcc893dfc91..8094b1920eef 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -475,7 +475,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_release); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); @@ -1218,23 +1218,17 @@ DEFINE_EVENT(xfs_readpage_class, name, \ DEFINE_READPAGE_EVENT(xfs_vm_readpage); DEFINE_READPAGE_EVENT(xfs_vm_readpages); -TRACE_DEFINE_ENUM(XFS_IO_HOLE); -TRACE_DEFINE_ENUM(XFS_IO_DELALLOC); -TRACE_DEFINE_ENUM(XFS_IO_UNWRITTEN); -TRACE_DEFINE_ENUM(XFS_IO_OVERWRITE); -TRACE_DEFINE_ENUM(XFS_IO_COW); - DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, - int type, struct xfs_bmbt_irec *irec), - TP_ARGS(ip, offset, count, type, irec), + int whichfork, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, whichfork, irec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(loff_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, type) + __field(int, whichfork) __field(xfs_fileoff_t, startoff) __field(xfs_fsblock_t, startblock) __field(xfs_filblks_t, blockcount) @@ -1245,33 +1239,33 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->type = type; + __entry->whichfork = whichfork; __entry->startoff = irec ? irec->br_startoff : 0; __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? 
irec->br_blockcount : 0; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->whichfork == XFS_COW_FORK ? "cow" : "data", __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) ) -#define DEFINE_IOMAP_EVENT(name) \ +#define DEFINE_IMAP_EVENT(name) \ DEFINE_EVENT(xfs_imap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ - int type, struct xfs_bmbt_irec *irec), \ - TP_ARGS(ip, offset, count, type, irec)) -DEFINE_IOMAP_EVENT(xfs_map_blocks_found); -DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_alloc); -DEFINE_IOMAP_EVENT(xfs_iomap_found); + int whichfork, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, whichfork, irec)) +DEFINE_IMAP_EVENT(xfs_map_blocks_found); +DEFINE_IMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_alloc); +DEFINE_IMAP_EVENT(xfs_iomap_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -3078,7 +3072,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \ DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); -DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +DEFINE_IMAP_EVENT(xfs_reflink_remap_imap); TRACE_EVENT(xfs_reflink_remap_blocks_loop, TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, xfs_filblks_t len, struct xfs_inode *dest, @@ -3202,13 +3196,10 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); -DEFINE_RW_EVENT(xfs_reflink_reserve_cow); - DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -3369,8 +3360,221 @@ DEFINE_TRANS_EVENT(xfs_trans_dup); DEFINE_TRANS_EVENT(xfs_trans_free); DEFINE_TRANS_EVENT(xfs_trans_roll); DEFINE_TRANS_EVENT(xfs_trans_add_item); +DEFINE_TRANS_EVENT(xfs_trans_commit_items); DEFINE_TRANS_EVENT(xfs_trans_free_items); +TRACE_EVENT(xfs_iunlink_update_bucket, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int bucket, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, bucket, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, bucket) + __field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bucket = bucket; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bucket, + __entry->old_ptr, + __entry->new_ptr) +); + +TRACE_EVENT(xfs_iunlink_update_dinode, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + xfs_agino_t old_ptr, xfs_agino_t new_ptr), + TP_ARGS(mp, agno, agino, old_ptr, new_ptr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + 
__field(xfs_agino_t, old_ptr) + __field(xfs_agino_t, new_ptr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->old_ptr = old_ptr; + __entry->new_ptr = new_ptr; + ), + TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->old_ptr, + __entry->new_ptr) +); + +DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno %u agino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->agino) +) + +#define DEFINE_AGINODE_EVENT(name) \ +DEFINE_EVENT(xfs_ag_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_AGINODE_EVENT(xfs_iunlink); +DEFINE_AGINODE_EVENT(xfs_iunlink_remove); +DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); + +DECLARE_EVENT_CLASS(xfs_fs_corrupt_class, + TP_PROTO(struct xfs_mount *mp, unsigned int flags), + TP_ARGS(mp, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->flags = flags; + ), + TP_printk("dev %d:%d flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->flags) +); +#define DEFINE_FS_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_fs_corrupt_class, name, \ + TP_PROTO(struct xfs_mount *mp, unsigned int flags), \ + TP_ARGS(mp, flags)) +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); +DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption); + +DECLARE_EVENT_CLASS(xfs_ag_corrupt_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int flags), + TP_ARGS(mp, agno, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, __entry->flags) +); +#define DEFINE_AG_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_ag_corrupt_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + unsigned int flags), \ + TP_ARGS(mp, agno, flags)) +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick); +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy); +DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption); + +DECLARE_EVENT_CLASS(xfs_inode_corrupt_class, + TP_PROTO(struct xfs_inode *ip, unsigned int flags), + TP_ARGS(ip, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, __entry->flags) +); +#define DEFINE_INODE_CORRUPT_EVENT(name) \ +DEFINE_EVENT(xfs_inode_corrupt_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned int flags), \ + TP_ARGS(ip, flags)) 
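
Each DEFINE_EVENT above generates a trace_<name>() hook that compiles away when the tracepoint is disabled. An illustrative call site, not taken from this patch, using two of the events defined here:

/* Hypothetical caller: flags would carry the XFS_SICK_* health bits. */
static void
example_trace_calls(struct xfs_mount *mp, struct xfs_inode *ip,
		    xfs_agnumber_t agno, unsigned int flags)
{
	trace_xfs_iunlink(ip);				/* xfs_ag_inode_class */
	trace_xfs_ag_mark_sick(mp, agno, flags);	/* xfs_ag_corrupt_class */
}
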
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); + +TRACE_EVENT(xfs_iwalk_ag, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino), + TP_ARGS(mp, agno, startino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + ), + TP_printk("dev %d:%d agno %d startino %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->startino) +) + +TRACE_EVENT(xfs_iwalk_ag_rec, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = irec->ir_startino; + __entry->freemask = irec->ir_free; + ), + TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->startino, __entry->freemask) +) + +TRACE_EVENT(xfs_pwork_init, + TP_PROTO(struct xfs_mount *mp, unsigned int nr_threads, pid_t pid), + TP_ARGS(mp, nr_threads, pid), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, nr_threads) + __field(pid_t, pid) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->nr_threads = nr_threads; + __entry->pid = pid; + ), + TP_printk("dev %d:%d nr_threads %u pid %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_threads, __entry->pid) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 912b42f5fe4a..d42a68d8313b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -11,7 +11,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_inode.h" #include "xfs_extent_busy.h" #include "xfs_quota.h" #include "xfs_trans.h" @@ -264,9 +263,7 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ - tp = kmem_zone_zalloc(xfs_trans_zone, - (flags & XFS_TRANS_NOFS) ? 
KM_NOFS : KM_SLEEP); - + tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -452,7 +449,7 @@ xfs_trans_apply_sb_deltas( xfs_buf_t *bp; int whole = 0; - bp = xfs_trans_getsb(tp, tp->t_mountp, 0); + bp = xfs_trans_getsb(tp, tp->t_mountp); sbp = XFS_BUF_TO_SBP(bp); /* @@ -767,10 +764,9 @@ xfs_trans_del_item( } /* Detach and unlock all of the items in a transaction */ -void +static void xfs_trans_free_items( struct xfs_trans *tp, - xfs_lsn_t commit_lsn, bool abort) { struct xfs_log_item *lip, *next; @@ -779,11 +775,10 @@ xfs_trans_free_items( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); - if (commit_lsn != NULLCOMMITLSN) - lip->li_ops->iop_committing(lip, commit_lsn); if (abort) set_bit(XFS_LI_ABORTED, &lip->li_flags); - lip->li_ops->iop_unlock(lip); + if (lip->li_ops->iop_release) + lip->li_ops->iop_release(lip); } } @@ -804,7 +799,8 @@ xfs_log_item_batch_insert( for (i = 0; i < nr_items; i++) { struct xfs_log_item *lip = log_items[i]; - lip->li_ops->iop_unpin(lip, 0); + if (lip->li_ops->iop_unpin) + lip->li_ops->iop_unpin(lip, 0); } } @@ -815,7 +811,7 @@ xfs_log_item_batch_insert( * * If we are called with the aborted flag set, it is because a log write during * a CIL checkpoint commit has failed. In this case, all the items in the - * checkpoint have already gone through iop_commited and iop_unlock, which + * checkpoint have already gone through iop_committed and iop_committing, which * means that checkpoint commit abort handling is treated exactly the same * as an iclog write error even though we haven't started any IO yet. Hence in * this case all we need to do is iop_committed processing, followed by an @@ -833,7 +829,7 @@ xfs_trans_committed_bulk( struct xfs_ail *ailp, struct xfs_log_vec *log_vector, xfs_lsn_t commit_lsn, - int aborted) + bool aborted) { #define LOG_ITEM_BATCH_SIZE 32 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; @@ -852,7 +848,16 @@ xfs_trans_committed_bulk( if (aborted) set_bit(XFS_LI_ABORTED, &lip->li_flags); - item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); + + if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { + lip->li_ops->iop_release(lip); + continue; + } + + if (lip->li_ops->iop_committed) + item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); + else + item_lsn = commit_lsn; /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) @@ -864,7 +869,8 @@ xfs_trans_committed_bulk( */ if (aborted) { ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount)); - lip->li_ops->iop_unpin(lip, 1); + if (lip->li_ops->iop_unpin) + lip->li_ops->iop_unpin(lip, 1); continue; } @@ -882,7 +888,8 @@ xfs_trans_committed_bulk( xfs_trans_ail_update(ailp, lip, item_lsn); else spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_unpin(lip, 0); + if (lip->li_ops->iop_unpin) + lip->li_ops->iop_unpin(lip, 0); continue; } @@ -998,7 +1005,7 @@ out_unreserve: tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); + xfs_trans_free_items(tp, !!error); xfs_trans_free(tp); XFS_STATS_INC(mp, xs_trans_empty); @@ -1060,7 +1067,7 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); + xfs_trans_free_items(tp, dirty); xfs_trans_free(tp); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 
c6e1c5704a8c..64d7f171ebd3 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -27,7 +27,7 @@ struct xfs_cud_log_item; struct xfs_bui_log_item; struct xfs_bud_log_item; -typedef struct xfs_log_item { +struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ @@ -48,7 +48,7 @@ typedef struct xfs_log_item { struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ -} xfs_log_item_t; +}; /* * li_flags use the (set/test/clear)_bit atomic interfaces because updates can @@ -67,17 +67,24 @@ typedef struct xfs_log_item { { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { - void (*iop_size)(xfs_log_item_t *, int *, int *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); - void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int remove); + unsigned flags; + void (*iop_size)(struct xfs_log_item *, int *, int *); + void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *); + void (*iop_pin)(struct xfs_log_item *); + void (*iop_unpin)(struct xfs_log_item *, int remove); uint (*iop_push)(struct xfs_log_item *, struct list_head *); - void (*iop_unlock)(xfs_log_item_t *); - xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); + void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); + void (*iop_release)(struct xfs_log_item *); + xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); }; +/* + * Release the log item as soon as committed. This is for items just logging + * intents that never need to be written back in place. 
+ */ +#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0) + void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, const struct xfs_item_ops *ops); @@ -203,7 +210,7 @@ xfs_trans_read_buf( flags, bpp, ops); } -struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); +struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); @@ -223,14 +230,6 @@ void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); bool xfs_trans_buf_is_dirty(struct xfs_buf *bp); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); -struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, - struct xfs_efi_log_item *, - uint); -int xfs_trans_free_extent(struct xfs_trans *, - struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t, - const struct xfs_owner_info *, - bool); int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **); int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); @@ -245,37 +244,4 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; -/* rmap updates */ -enum xfs_rmap_intent_type; - -struct xfs_rud_log_item *xfs_trans_get_rud(struct xfs_trans *tp, - struct xfs_rui_log_item *ruip); -int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, enum xfs_rmap_intent_type type, - uint64_t owner, int whichfork, xfs_fileoff_t startoff, - xfs_fsblock_t startblock, xfs_filblks_t blockcount, - xfs_exntst_t state, struct xfs_btree_cur **pcur); - -/* refcount updates */ -enum xfs_refcount_intent_type; - -struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, - struct xfs_cui_log_item *cuip); -int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, - xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); - -/* mapping updates */ -enum xfs_bmap_intent_type; - -struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, - struct xfs_bui_log_item *buip); -int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, - struct xfs_bud_log_item *rudp, enum xfs_bmap_intent_type type, - struct xfs_inode *ip, int whichfork, xfs_fileoff_t startoff, - xfs_fsblock_t startblock, xfs_filblks_t *blockcount, - xfs_exntst_t state); - #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d3a4e89bf4a0..6ccfd75d3c24 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -6,6 +6,7 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -74,29 +75,29 @@ xfs_ail_check( * Return a pointer to the last item in the AIL. If the AIL is empty, then * return NULL. */ -static xfs_log_item_t * +static struct xfs_log_item * xfs_ail_max( struct xfs_ail *ailp) { if (list_empty(&ailp->ail_head)) return NULL; - return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail); + return list_entry(ailp->ail_head.prev, struct xfs_log_item, li_ail); } /* * Return a pointer to the item which follows the given item in the AIL. If * the given item is the last item in the list, then return NULL. 
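
Several callbacks in the reshuffled struct xfs_item_ops are now optional; intent items, for instance, provide no iop_push, and the AIL code below treats the missing hook as "pinned". Generic consumers therefore null-check before calling, as this wrapper-shaped sketch shows:

/* Defensive wrapper shape used by the generic transaction code. */
static inline void
log_item_unpin(struct xfs_log_item *lip, int remove)
{
	if (lip->li_ops->iop_unpin)	/* optional for intent-only items */
		lip->li_ops->iop_unpin(lip, remove);
}
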
*/ -static xfs_log_item_t * +static struct xfs_log_item * xfs_ail_next( - struct xfs_ail *ailp, - xfs_log_item_t *lip) + struct xfs_ail *ailp, + struct xfs_log_item *lip) { if (lip->li_ail.next == &ailp->ail_head) return NULL; - return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); + return list_first_entry(&lip->li_ail, struct xfs_log_item, li_ail); } /* @@ -109,10 +110,10 @@ xfs_ail_next( */ xfs_lsn_t xfs_ail_min_lsn( - struct xfs_ail *ailp) + struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - xfs_log_item_t *lip; + xfs_lsn_t lsn = 0; + struct xfs_log_item *lip; spin_lock(&ailp->ail_lock); lip = xfs_ail_min(ailp); @@ -128,10 +129,10 @@ xfs_ail_min_lsn( */ static xfs_lsn_t xfs_ail_max_lsn( - struct xfs_ail *ailp) + struct xfs_ail *ailp) { - xfs_lsn_t lsn = 0; - xfs_log_item_t *lip; + xfs_lsn_t lsn = 0; + struct xfs_log_item *lip; spin_lock(&ailp->ail_lock); lip = xfs_ail_max(ailp); @@ -216,13 +217,13 @@ xfs_trans_ail_cursor_clear( * ascending traversal. Pass a @lsn of zero to initialise the cursor to the * first item in the AIL. Returns NULL if the list is empty. */ -xfs_log_item_t * +struct xfs_log_item * xfs_trans_ail_cursor_first( struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn) { - xfs_log_item_t *lip; + struct xfs_log_item *lip; xfs_trans_ail_cursor_init(ailp, cur); @@ -248,7 +249,7 @@ __xfs_trans_ail_cursor_last( struct xfs_ail *ailp, xfs_lsn_t lsn) { - xfs_log_item_t *lip; + struct xfs_log_item *lip; list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) @@ -327,8 +328,8 @@ xfs_ail_splice( */ static void xfs_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip) + struct xfs_ail *ailp, + struct xfs_log_item *lip) { xfs_ail_check(ailp, lip); list_del(&lip->li_ail); @@ -347,6 +348,14 @@ xfsaild_push_item( if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; + /* + * Consider the item pinned if a push callback is not defined so the + * caller will force the log. This should only happen for intent items + * as they are unpinned once the associated done item is committed to + * the on-disk log. + */ + if (!lip->li_ops->iop_push) + return XFS_ITEM_PINNED; return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } @@ -356,7 +365,7 @@ xfsaild_push( { xfs_mount_t *mp = ailp->ail_mount; struct xfs_ail_cursor cur; - xfs_log_item_t *lip; + struct xfs_log_item *lip; xfs_lsn_t lsn; xfs_lsn_t target; long tout; @@ -611,10 +620,10 @@ xfsaild( */ void xfs_ail_push( - struct xfs_ail *ailp, - xfs_lsn_t threshold_lsn) + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) { - xfs_log_item_t *lip; + struct xfs_log_item *lip; lip = xfs_ail_min(ailp); if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) || @@ -699,7 +708,7 @@ xfs_trans_ail_update_bulk( int nr_items, xfs_lsn_t lsn) __releases(ailp->ail_lock) { - xfs_log_item_t *mlip; + struct xfs_log_item *mlip; int mlip_changed = 0; int i; LIST_HEAD(tmp); diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c deleted file mode 100644 index 11cff449d055..000000000000 --- a/fs/xfs/xfs_trans_bmap.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (C) 2016 Oracle. All Rights Reserved. - * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_shared.h" -#include "xfs_format.h" -#include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_bmap_item.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" -#include "xfs_inode.h" -#include "xfs_defer.h" - -/* - * This routine is called to allocate a "bmap update done" - * log item. - */ -struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) -{ - struct xfs_bud_log_item *budp; - - budp = xfs_bud_init(tp->t_mountp, buip); - xfs_trans_add_item(tp, &budp->bud_item); - return budp; -} - -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - enum xfs_bmap_intent_type type, - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t *blockcount, - xfs_exntst_t state) -{ - int error; - - error = xfs_bmap_finish_one(tp, ip, type, whichfork, startoff, - startblock, blockcount, state); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the BUI and frees the BUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; -} - -/* Sort bmap intents by inode. */ -static int -xfs_bmap_update_diff_items( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_bmap_intent *ba; - struct xfs_bmap_intent *bb; - - ba = container_of(a, struct xfs_bmap_intent, bi_list); - bb = container_of(b, struct xfs_bmap_intent, bi_list); - return ba->bi_owner->i_ino - bb->bi_owner->i_ino; -} - -/* Get an BUI. */ -STATIC void * -xfs_bmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_bui_log_item *buip; - - ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - ASSERT(tp != NULL); - - buip = xfs_bui_init(tp->t_mountp); - ASSERT(buip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &buip->bui_item); - return buip; -} - -/* Set the map extent flags for this mapping. */ -static void -xfs_trans_set_bmap_flags( - struct xfs_map_extent *bmap, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_exntst_t state) -{ - bmap->me_flags = 0; - switch (type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - bmap->me_flags = type; - break; - default: - ASSERT(0); - } - if (state == XFS_EXT_UNWRITTEN) - bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) - bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; -} - -/* Log bmap updates in the intent item. */ -STATIC void -xfs_bmap_update_log_item( - struct xfs_trans *tp, - void *intent, - struct list_head *item) -{ - struct xfs_bui_log_item *buip = intent; - struct xfs_bmap_intent *bmap; - uint next_extent; - struct xfs_map_extent *map; - - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. 
- */ - next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; - ASSERT(next_extent < buip->bui_format.bui_nextents); - map = &buip->bui_format.bui_extents[next_extent]; - map->me_owner = bmap->bi_owner->i_ino; - map->me_startblock = bmap->bi_bmap.br_startblock; - map->me_startoff = bmap->bi_bmap.br_startoff; - map->me_len = bmap->bi_bmap.br_blockcount; - xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, - bmap->bi_bmap.br_state); -} - -/* Get an BUD so we can process all the deferred rmap updates. */ -STATIC void * -xfs_bmap_update_create_done( - struct xfs_trans *tp, - void *intent, - unsigned int count) -{ - return xfs_trans_get_bud(tp, intent); -} - -/* Process a deferred rmap update. */ -STATIC int -xfs_bmap_update_finish_item( - struct xfs_trans *tp, - struct list_head *item, - void *done_item, - void **state) -{ - struct xfs_bmap_intent *bmap; - xfs_filblks_t count; - int error; - - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - count = bmap->bi_bmap.br_blockcount; - error = xfs_trans_log_finish_bmap_update(tp, done_item, - bmap->bi_type, - bmap->bi_owner, bmap->bi_whichfork, - bmap->bi_bmap.br_startoff, - bmap->bi_bmap.br_startblock, - &count, - bmap->bi_bmap.br_state); - if (!error && count > 0) { - ASSERT(bmap->bi_type == XFS_BMAP_UNMAP); - bmap->bi_bmap.br_blockcount = count; - return -EAGAIN; - } - kmem_free(bmap); - return error; -} - -/* Abort all pending BUIs. */ -STATIC void -xfs_bmap_update_abort_intent( - void *intent) -{ - xfs_bui_release(intent); -} - -/* Cancel a deferred rmap update. */ -STATIC void -xfs_bmap_update_cancel_item( - struct list_head *item) -{ - struct xfs_bmap_intent *bmap; - - bmap = container_of(item, struct xfs_bmap_intent, bi_list); - kmem_free(bmap); -} - -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .diff_items = xfs_bmap_update_diff_items, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .log_item = xfs_bmap_update_log_item, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 629f1479c9d2..b5b3a78ef31c 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -10,11 +10,9 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" -#include "xfs_error.h" #include "xfs_trace.h" /* @@ -174,8 +172,7 @@ xfs_trans_get_buf_map( xfs_buf_t * xfs_trans_getsb( xfs_trans_t *tp, - struct xfs_mount *mp, - int flags) + struct xfs_mount *mp) { xfs_buf_t *bp; struct xfs_buf_log_item *bip; @@ -185,7 +182,7 @@ xfs_trans_getsb( * if tp is NULL. */ if (tp == NULL) - return xfs_getsb(mp, flags); + return xfs_getsb(mp); /* * If the superblock buffer already has this transaction @@ -203,7 +200,7 @@ xfs_trans_getsb( return bp; } - bp = xfs_getsb(mp, flags); + bp = xfs_getsb(mp); if (bp == NULL) return NULL; @@ -277,7 +274,7 @@ xfs_trans_read_buf_map( * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); - error = xfs_buf_ensure_ops(bp, ops); + error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __func__); @@ -428,7 +425,7 @@ xfs_trans_brelse( /* * Mark the buffer as not needing to be unlocked when the buf item's - * iop_unlock() routine is called. 
The buffer must already be locked + * iop_committing() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c23257a26c2b..1027c9ca6eb8 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -11,7 +11,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_quota.h" @@ -29,7 +28,6 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(dqp->q_logitem.qli_dquot == dqp); @@ -37,15 +35,8 @@ xfs_trans_dqjoin( * Get a log_item_desc to point at the new item. */ xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); - - /* - * Initialize d_transp so we can later determine if this dquot is - * associated with this transaction. - */ - dqp->q_transp = tp; } - /* * This is called to mark the dquot as needing * to be logged when the transaction is committed. The dquot must @@ -61,7 +52,6 @@ xfs_trans_log_dquot( xfs_trans_t *tp, xfs_dquot_t *dqp) { - ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); tp->t_flags |= XFS_TRANS_DIRTY; @@ -74,13 +64,13 @@ xfs_trans_log_dquot( */ void xfs_trans_dup_dqinfo( - xfs_trans_t *otp, - xfs_trans_t *ntp) + struct xfs_trans *otp, + struct xfs_trans *ntp) { - xfs_dqtrx_t *oq, *nq; - int i, j; - xfs_dqtrx_t *oqa, *nqa; - ulong blk_res_used; + struct xfs_dqtrx *oq, *nq; + int i, j; + struct xfs_dqtrx *oqa, *nqa; + uint64_t blk_res_used; if (!otp->t_dqinfo) return; @@ -137,7 +127,7 @@ xfs_trans_mod_dquot_byino( xfs_trans_t *tp, xfs_inode_t *ip, uint field, - long delta) + int64_t delta) { xfs_mount_t *mp = tp->t_mountp; @@ -191,12 +181,12 @@ xfs_trans_get_dqtrx( */ void xfs_trans_mod_dquot( - xfs_trans_t *tp, - xfs_dquot_t *dqp, - uint field, - long delta) + struct xfs_trans *tp, + struct xfs_dquot *dqp, + uint field, + int64_t delta) { - xfs_dqtrx_t *qtrx; + struct xfs_dqtrx *qtrx; ASSERT(tp); ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); @@ -219,14 +209,14 @@ xfs_trans_mod_dquot( * regular disk blk reservation */ case XFS_TRANS_DQ_RES_BLKS: - qtrx->qt_blk_res += (ulong)delta; + qtrx->qt_blk_res += delta; break; /* * inode reservation */ case XFS_TRANS_DQ_RES_INOS: - qtrx->qt_ino_res += (ulong)delta; + qtrx->qt_ino_res += delta; break; /* @@ -245,7 +235,7 @@ xfs_trans_mod_dquot( */ case XFS_TRANS_DQ_ICOUNT: if (qtrx->qt_ino_res && delta > 0) { - qtrx->qt_ino_res_used += (ulong)delta; + qtrx->qt_ino_res_used += delta; ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); } qtrx->qt_icount_delta += delta; @@ -255,7 +245,7 @@ xfs_trans_mod_dquot( * rtblk reservation */ case XFS_TRANS_DQ_RES_RTBLKS: - qtrx->qt_rtblk_res += (ulong)delta; + qtrx->qt_rtblk_res += delta; break; /* @@ -263,7 +253,7 @@ xfs_trans_mod_dquot( */ case XFS_TRANS_DQ_RTBCOUNT: if (qtrx->qt_rtblk_res && delta > 0) { - qtrx->qt_rtblk_res_used += (ulong)delta; + qtrx->qt_rtblk_res_used += delta; ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); } qtrx->qt_rtbcount_delta += delta; @@ -288,8 +278,8 @@ xfs_trans_mod_dquot( */ STATIC void xfs_trans_dqlockedjoin( - xfs_trans_t *tp, - xfs_dqtrx_t *q) + struct xfs_trans *tp, + struct xfs_dqtrx *q) { ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { @@ -320,8 +310,8 @@ xfs_trans_apply_dquot_deltas( struct xfs_dquot *dqp; struct xfs_dqtrx *qtrx, *qa; struct xfs_disk_dquot *d; - long totalbdelta; - long totalrtbdelta; + 
int64_t totalbdelta; + int64_t totalrtbdelta; if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; @@ -347,7 +337,6 @@ xfs_trans_apply_dquot_deltas( break; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(dqp->q_transp == tp); /* * adjust the actual number of blocks used @@ -413,7 +402,7 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. */ if (qtrx->qt_blk_res != 0) { - ulong blk_res_used = 0; + uint64_t blk_res_used = 0; if (qtrx->qt_bcount_delta > 0) blk_res_used = qtrx->qt_bcount_delta; @@ -501,7 +490,7 @@ xfs_trans_unreserve_and_mod_dquots( { int i, j; xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; + struct xfs_dqtrx *qtrx, *qa; bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) @@ -585,7 +574,7 @@ xfs_trans_dqresv( xfs_trans_t *tp, xfs_mount_t *mp, xfs_dquot_t *dqp, - long nblks, + int64_t nblks, long ninos, uint flags) { @@ -745,7 +734,7 @@ xfs_trans_reserve_quota_bydquots( struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, - long nblks, + int64_t nblks, long ninos, uint flags) { @@ -804,7 +793,7 @@ int xfs_trans_reserve_quota_nblks( struct xfs_trans *tp, struct xfs_inode *ip, - long nblks, + int64_t nblks, long ninos, uint flags) { diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c deleted file mode 100644 index 0710434eb240..000000000000 --- a/fs/xfs/xfs_trans_extfree.c +++ /dev/null @@ -1,287 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_shared.h" -#include "xfs_format.h" -#include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_bit.h" -#include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_extfree_item.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" -#include "xfs_trace.h" -#include "xfs_defer.h" - -/* - * This routine is called to allocate an "extent free done" - * log item that will hold nextents worth of extents. The - * caller must use all nextents extents, because we are not - * flexible about this at all. - */ -struct xfs_efd_log_item * -xfs_trans_get_efd(struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - uint nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(tp != NULL); - ASSERT(nextents > 0); - - efdp = xfs_efd_init(tp->t_mountp, efip, nextents); - ASSERT(efdp != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efdp->efd_item); - return efdp; -} - -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. - */ -int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - xfs_fsblock_t start_block, - xfs_extlen_t ext_len, - const struct xfs_owner_info *oinfo, - bool skip_discard) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, start_block); - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - start_block); - int error; - - trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - - error = __xfs_free_extent(tp, start_block, ext_len, - oinfo, XFS_AG_RESV_NONE, skip_discard); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) 
shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = start_block; - extp->ext_len = ext_len; - efdp->efd_next_extent++; - - return error; -} - -/* Sort bmap items by AG. */ -static int -xfs_extent_free_diff_items( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_mount *mp = priv; - struct xfs_extent_free_item *ra; - struct xfs_extent_free_item *rb; - - ra = container_of(a, struct xfs_extent_free_item, xefi_list); - rb = container_of(b, struct xfs_extent_free_item, xefi_list); - return XFS_FSB_TO_AGNO(mp, ra->xefi_startblock) - - XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); -} - -/* Get an EFI. */ -STATIC void * -xfs_extent_free_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_efi_log_item *efip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - efip = xfs_efi_init(tp->t_mountp, count); - ASSERT(efip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &efip->efi_item); - return efip; -} - -/* Log a free extent to the intent item. */ -STATIC void -xfs_extent_free_log_item( - struct xfs_trans *tp, - void *intent, - struct list_head *item) -{ - struct xfs_efi_log_item *efip = intent; - struct xfs_extent_free_item *free; - uint next_extent; - struct xfs_extent *extp; - - free = container_of(item, struct xfs_extent_free_item, xefi_list); - - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. - */ - next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; - ASSERT(next_extent < efip->efi_format.efi_nextents); - extp = &efip->efi_format.efi_extents[next_extent]; - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; -} - -/* Get an EFD so we can process all the free extents. */ -STATIC void * -xfs_extent_free_create_done( - struct xfs_trans *tp, - void *intent, - unsigned int count) -{ - return xfs_trans_get_efd(tp, intent, count); -} - -/* Process a free extent. */ -STATIC int -xfs_extent_free_finish_item( - struct xfs_trans *tp, - struct list_head *item, - void *done_item, - void **state) -{ - struct xfs_extent_free_item *free; - int error; - - free = container_of(item, struct xfs_extent_free_item, xefi_list); - error = xfs_trans_free_extent(tp, done_item, - free->xefi_startblock, - free->xefi_blockcount, - &free->xefi_oinfo, free->xefi_skip_discard); - kmem_free(free); - return error; -} - -/* Abort all pending EFIs. */ -STATIC void -xfs_extent_free_abort_intent( - void *intent) -{ - xfs_efi_release(intent); -} - -/* Cancel a free extent. 
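An aside on the code being deleted above: xfs_trans_free_extent() and xfs_agfl_free_finish_item() share one rule worth noting — the transaction and the EFD log item are marked dirty even when the extent free fails, so the resulting cancel tears down the EFI/EFD pair (and shuts the filesystem down) instead of leaking a logged intent. A minimal sketch of that rule in plain C; the sketch_* names are hypothetical stand-ins, not XFS API:

#include <stdbool.h>

struct sketch_trans { unsigned int flags; };
struct sketch_done  { bool dirty; };

#define SKETCH_TRANS_DIRTY 0x01u

/*
 * Do the work first, then dirty both the transaction and the done item
 * regardless of the outcome: a nonzero return now cancels a *dirty*
 * transaction, which is what forces the intent/done cleanup to run.
 */
static int sketch_finish(struct sketch_trans *tp, struct sketch_done *done,
                         int (*do_work)(void))
{
        int error = do_work();

        tp->flags |= SKETCH_TRANS_DIRTY;
        done->dirty = true;
        return error;
}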
*/ -STATIC void -xfs_extent_free_cancel_item( - struct list_head *item) -{ - struct xfs_extent_free_item *free; - - free = container_of(item, struct xfs_extent_free_item, xefi_list); - kmem_free(free); -} - -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - -/* - * AGFL blocks are accounted differently in the reserve pools and are not - * inserted into the busy extent list. - */ -STATIC int -xfs_agfl_free_finish_item( - struct xfs_trans *tp, - struct list_head *item, - void *done_item, - void **state) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_efd_log_item *efdp = done_item; - struct xfs_extent_free_item *free; - struct xfs_extent *extp; - struct xfs_buf *agbp; - int error; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - uint next_extent; - - free = container_of(item, struct xfs_extent_free_item, xefi_list); - ASSERT(free->xefi_blockcount == 1); - agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); - agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); - - trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); - - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); - if (!error) - error = xfs_free_agfl_block(tp, agno, agbno, agbp, - &free->xefi_oinfo); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = free->xefi_startblock; - extp->ext_len = free->xefi_blockcount; - efdp->efd_next_extent++; - - kmem_free(free); - return error; -} - - -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .diff_items = xfs_extent_free_diff_items, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .log_item = xfs_extent_free_log_item, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 091eae9f4e74..2e073c1c4614 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -16,12 +16,10 @@ struct xfs_log_vec; void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); -void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - bool abort); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, - xfs_lsn_t commit_lsn, int aborted); + xfs_lsn_t commit_lsn, bool aborted); /* * AIL traversal cursor. 
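Stepping back to the xfs_trans_dquot.c hunks earlier in this patch: the quota deltas and reservations move from long/ulong to explicit int64_t/uint64_t, and the (ulong)delta casts go away. On ILP32 builds a long is only 32 bits, so large deltas could previously truncate. An illustrative standalone program (the magnitude is made up):

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
        /* A delta larger than 2^32: representable in int64_t on every ABI. */
        int64_t delta = 6442450944LL;

        /* What a cast through a 32-bit unsigned long would have done. */
        uint32_t narrow = (uint32_t)delta;      /* wraps to 2147483648 */

        printf("widened:   %" PRId64 "\n", delta);
        printf("truncated: %" PRIu32 "\n", narrow);
        return 0;
}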
* diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c deleted file mode 100644 index 6c947ff4faf6..000000000000 --- a/fs/xfs/xfs_trans_refcount.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (C) 2016 Oracle. All Rights Reserved. - * Author: Darrick J. Wong <darrick.wong@oracle.com> - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_shared.h" -#include "xfs_format.h" -#include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_refcount_item.h" -#include "xfs_alloc.h" -#include "xfs_refcount.h" -#include "xfs_defer.h" - -/* - * This routine is called to allocate a "refcount update done" - * log item. - */ -struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = xfs_cud_init(tp->t_mountp, cuip); - xfs_trans_add_item(tp, &cudp->cud_item); - return cudp; -} - -/* - * Finish an refcount update and log it to the CUD. Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - enum xfs_refcount_intent_type type, - xfs_fsblock_t startblock, - xfs_extlen_t blockcount, - xfs_fsblock_t *new_fsb, - xfs_extlen_t *new_len, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_refcount_finish_one(tp, type, startblock, - blockcount, new_fsb, new_len, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; -} - -/* Sort refcount intents by AG. */ -static int -xfs_refcount_update_diff_items( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_mount *mp = priv; - struct xfs_refcount_intent *ra; - struct xfs_refcount_intent *rb; - - ra = container_of(a, struct xfs_refcount_intent, ri_list); - rb = container_of(b, struct xfs_refcount_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_startblock); -} - -/* Get an CUI. */ -STATIC void * -xfs_refcount_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_cui_log_item *cuip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - cuip = xfs_cui_init(tp->t_mountp, count); - ASSERT(cuip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &cuip->cui_item); - return cuip; -} - -/* Set the phys extent flags for this reverse mapping. */ -static void -xfs_trans_set_refcount_flags( - struct xfs_phys_extent *refc, - enum xfs_refcount_intent_type type) -{ - refc->pe_flags = 0; - switch (type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - refc->pe_flags |= type; - break; - default: - ASSERT(0); - } -} - -/* Log refcount updates in the intent item. 
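The log_item helper just below — xfs_refcount_update_log_item(), like its extfree and rmap siblings — claims an array slot with atomic_inc_return(...) - 1, as its comment explains. The same idea in portable C11, where atomic_fetch_add() returns the pre-increment value and therefore needs no adjustment; the sketch_* names are invented for illustration:

#include <assert.h>
#include <stdatomic.h>

#define SKETCH_NEXTENTS 16u

struct sketch_intent {
        atomic_uint next_extent;
        int extents[SKETCH_NEXTENTS];
};

/*
 * Claim the next free slot in the intent's extent array. The fetch-add
 * makes concurrent callers safe: each one observes a distinct old value,
 * hence a distinct index.
 */
static int *sketch_claim_slot(struct sketch_intent *ip)
{
        unsigned int idx = atomic_fetch_add(&ip->next_extent, 1);

        assert(idx < SKETCH_NEXTENTS);
        return &ip->extents[idx];
}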
*/ -STATIC void -xfs_refcount_update_log_item( - struct xfs_trans *tp, - void *intent, - struct list_head *item) -{ - struct xfs_cui_log_item *cuip = intent; - struct xfs_refcount_intent *refc; - uint next_extent; - struct xfs_phys_extent *ext; - - refc = container_of(item, struct xfs_refcount_intent, ri_list); - - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. - */ - next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; - ASSERT(next_extent < cuip->cui_format.cui_nextents); - ext = &cuip->cui_format.cui_extents[next_extent]; - ext->pe_startblock = refc->ri_startblock; - ext->pe_len = refc->ri_blockcount; - xfs_trans_set_refcount_flags(ext, refc->ri_type); -} - -/* Get an CUD so we can process all the deferred refcount updates. */ -STATIC void * -xfs_refcount_update_create_done( - struct xfs_trans *tp, - void *intent, - unsigned int count) -{ - return xfs_trans_get_cud(tp, intent); -} - -/* Process a deferred refcount update. */ -STATIC int -xfs_refcount_update_finish_item( - struct xfs_trans *tp, - struct list_head *item, - void *done_item, - void **state) -{ - struct xfs_refcount_intent *refc; - xfs_fsblock_t new_fsb; - xfs_extlen_t new_aglen; - int error; - - refc = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, done_item, - refc->ri_type, - refc->ri_startblock, - refc->ri_blockcount, - &new_fsb, &new_aglen, - (struct xfs_btree_cur **)state); - /* Did we run out of reservation? Requeue what we didn't finish. */ - if (!error && new_aglen > 0) { - ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || - refc->ri_type == XFS_REFCOUNT_DECREASE); - refc->ri_startblock = new_fsb; - refc->ri_blockcount = new_aglen; - return -EAGAIN; - } - kmem_free(refc); - return error; -} - -/* Clean up after processing deferred refcounts. */ -STATIC void -xfs_refcount_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_refcount_finish_one_cleanup(tp, rcur, error); -} - -/* Abort all pending CUIs. */ -STATIC void -xfs_refcount_update_abort_intent( - void *intent) -{ - xfs_cui_release(intent); -} - -/* Cancel a deferred refcount update. */ -STATIC void -xfs_refcount_update_cancel_item( - struct list_head *item) -{ - struct xfs_refcount_intent *refc; - - refc = container_of(item, struct xfs_refcount_intent, ri_list); - kmem_free(refc); -} - -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .diff_items = xfs_refcount_update_diff_items, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .log_item = xfs_refcount_update_log_item, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_update_finish_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c deleted file mode 100644 index a42890931ecd..000000000000 --- a/fs/xfs/xfs_trans_rmap.c +++ /dev/null @@ -1,258 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Copyright (C) 2016 Oracle. All Rights Reserved. - * Author: Darrick J. 
Wong <darrick.wong@oracle.com> - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_shared.h" -#include "xfs_format.h" -#include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" -#include "xfs_defer.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" -#include "xfs_rmap_item.h" -#include "xfs_alloc.h" -#include "xfs_rmap.h" -#include "xfs_defer.h" - -/* Set the map extent flags for this reverse mapping. */ -static void -xfs_trans_set_rmap_flags( - struct xfs_map_extent *rmap, - enum xfs_rmap_intent_type type, - int whichfork, - xfs_exntst_t state) -{ - rmap->me_flags = 0; - if (state == XFS_EXT_UNWRITTEN) - rmap->me_flags |= XFS_RMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) - rmap->me_flags |= XFS_RMAP_EXTENT_ATTR_FORK; - switch (type) { - case XFS_RMAP_MAP: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP; - break; - case XFS_RMAP_MAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; - break; - case XFS_RMAP_UNMAP: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; - break; - case XFS_RMAP_UNMAP_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; - break; - case XFS_RMAP_CONVERT: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; - break; - case XFS_RMAP_CONVERT_SHARED: - rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; - break; - case XFS_RMAP_ALLOC: - rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; - break; - case XFS_RMAP_FREE: - rmap->me_flags |= XFS_RMAP_EXTENT_FREE; - break; - default: - ASSERT(0); - } -} - -struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = xfs_rud_init(tp->t_mountp, ruip); - xfs_trans_add_item(tp, &rudp->rud_item); - return rudp; -} - -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - enum xfs_rmap_intent_type type, - uint64_t owner, - int whichfork, - xfs_fileoff_t startoff, - xfs_fsblock_t startblock, - xfs_filblks_t blockcount, - xfs_exntst_t state, - struct xfs_btree_cur **pcur) -{ - int error; - - error = xfs_rmap_finish_one(tp, type, owner, whichfork, startoff, - startblock, blockcount, state, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; -} - -/* Sort rmap intents by AG. */ -static int -xfs_rmap_update_diff_items( - void *priv, - struct list_head *a, - struct list_head *b) -{ - struct xfs_mount *mp = priv; - struct xfs_rmap_intent *ra; - struct xfs_rmap_intent *rb; - - ra = container_of(a, struct xfs_rmap_intent, ri_list); - rb = container_of(b, struct xfs_rmap_intent, ri_list); - return XFS_FSB_TO_AGNO(mp, ra->ri_bmap.br_startblock) - - XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); -} - -/* Get an RUI. */ -STATIC void * -xfs_rmap_update_create_intent( - struct xfs_trans *tp, - unsigned int count) -{ - struct xfs_rui_log_item *ruip; - - ASSERT(tp != NULL); - ASSERT(count > 0); - - ruip = xfs_rui_init(tp->t_mountp, count); - ASSERT(ruip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &ruip->rui_item); - return ruip; -} - -/* Log rmap updates in the intent item. 
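One detail from the deleted refcount code above is worth keeping in mind when reading this file's rmap equivalents below: xfs_refcount_update_finish_item() handles running out of transaction reservation by rewriting the intent in place to cover only the remainder and returning -EAGAIN, so the deferred-ops machinery can requeue the rest. A stripped-down model of that convention, with placeholder types and names:

#include <errno.h>

struct sketch_work {
        unsigned long long start;       /* e.g. a start block */
        unsigned long count;            /* e.g. a block count */
};

typedef int (*sketch_step_fn)(unsigned long long start, unsigned long count,
                              unsigned long long *new_start,
                              unsigned long *new_count);

/*
 * Run one step. If it succeeded but left a remainder, shrink the work
 * item in place and report -EAGAIN so the caller requeues it in a fresh
 * transaction rather than overrunning this one's reservation.
 */
static int sketch_finish_item(struct sketch_work *w, sketch_step_fn step)
{
        unsigned long long new_start;
        unsigned long new_count;
        int error = step(w->start, w->count, &new_start, &new_count);

        if (!error && new_count > 0) {
                w->start = new_start;
                w->count = new_count;
                return -EAGAIN;
        }
        return error;
}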
*/ -STATIC void -xfs_rmap_update_log_item( - struct xfs_trans *tp, - void *intent, - struct list_head *item) -{ - struct xfs_rui_log_item *ruip = intent; - struct xfs_rmap_intent *rmap; - uint next_extent; - struct xfs_map_extent *map; - - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - - /* - * atomic_inc_return gives us the value after the increment; - * we want to use it as an array index so we need to subtract 1 from - * it. - */ - next_extent = atomic_inc_return(&ruip->rui_next_extent) - 1; - ASSERT(next_extent < ruip->rui_format.rui_nextents); - map = &ruip->rui_format.rui_extents[next_extent]; - map->me_owner = rmap->ri_owner; - map->me_startblock = rmap->ri_bmap.br_startblock; - map->me_startoff = rmap->ri_bmap.br_startoff; - map->me_len = rmap->ri_bmap.br_blockcount; - xfs_trans_set_rmap_flags(map, rmap->ri_type, rmap->ri_whichfork, - rmap->ri_bmap.br_state); -} - -/* Get an RUD so we can process all the deferred rmap updates. */ -STATIC void * -xfs_rmap_update_create_done( - struct xfs_trans *tp, - void *intent, - unsigned int count) -{ - return xfs_trans_get_rud(tp, intent); -} - -/* Process a deferred rmap update. */ -STATIC int -xfs_rmap_update_finish_item( - struct xfs_trans *tp, - struct list_head *item, - void *done_item, - void **state) -{ - struct xfs_rmap_intent *rmap; - int error; - - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, done_item, - rmap->ri_type, - rmap->ri_owner, rmap->ri_whichfork, - rmap->ri_bmap.br_startoff, - rmap->ri_bmap.br_startblock, - rmap->ri_bmap.br_blockcount, - rmap->ri_bmap.br_state, - (struct xfs_btree_cur **)state); - kmem_free(rmap); - return error; -} - -/* Clean up after processing deferred rmaps. */ -STATIC void -xfs_rmap_update_finish_cleanup( - struct xfs_trans *tp, - void *state, - int error) -{ - struct xfs_btree_cur *rcur = state; - - xfs_rmap_finish_one_cleanup(tp, rcur, error); -} - -/* Abort all pending RUIs. */ -STATIC void -xfs_rmap_update_abort_intent( - void *intent) -{ - xfs_rui_release(intent); -} - -/* Cancel a deferred rmap update. */ -STATIC void -xfs_rmap_update_cancel_item( - struct list_head *item) -{ - struct xfs_rmap_intent *rmap; - - rmap = container_of(item, struct xfs_rmap_intent, ri_list); - kmem_free(rmap); -} - -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .diff_items = xfs_rmap_update_diff_items, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .log_item = xfs_rmap_update_log_item, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_update_finish_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 63ee1d5bf1d7..3123b5aaad2a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -5,15 +5,12 @@ */ #include "xfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" #include "xfs_attr.h" -#include "xfs_attr_leaf.h" -#include "xfs_acl.h" #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -129,6 +126,9 @@ __xfs_xattr_put_listent( char *offset; int arraytop; + if (context->count < 0 || context->seen_enough) + return; + if (!context->alist) goto compute_size; |
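Finally, the xfs_xattr.c hunk hardens __xfs_xattr_put_listent(): once the listing has recorded an overflow (count < 0) or filled the caller's buffer (seen_enough), further calls return immediately instead of writing past the end. The rest of the function is not shown in this patch, so the following is a speculative userspace model of the two-pass listxattr convention it follows — measure when no buffer is supplied, copy names otherwise:

#include <stdbool.h>
#include <string.h>
#include <sys/types.h>

struct sketch_listent_ctx {
        char *alist;            /* NULL means "size-only" pass */
        ssize_t count;          /* bytes emitted; < 0 records an error */
        size_t bufsize;
        bool seen_enough;
};

static void sketch_put_listent(struct sketch_listent_ctx *ctx,
                               const char *name, size_t namelen)
{
        /* Mirror of the new guard: bail out once poisoned or full. */
        if (ctx->count < 0 || ctx->seen_enough)
                return;

        if (!ctx->alist) {                      /* pass 1: just measure */
                ctx->count += namelen + 1;
                return;
        }

        if ((size_t)ctx->count + namelen + 1 > ctx->bufsize) {
                ctx->seen_enough = true;        /* pass 2: buffer exhausted */
                return;
        }

        memcpy(ctx->alist + ctx->count, name, namelen);
        ctx->alist[ctx->count + namelen] = '\0';
        ctx->count += namelen + 1;
}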