From 187372a3b9faff68ed61c291d0135e6739e0dbdf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:40:51 +1100 Subject: direct-io: always call ->end_io if non-NULL This way we can pass back errors to the file system, and allow for cleanup required for all direct I/O invocations. Also allow the ->end_io handlers to return errors on their own, so that I/O completion errors can be passed on to the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..295aaffea78e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1645,7 +1645,7 @@ out_end_io: * case the completion can be called in interrupt context, whereas if we have an * ioend we will always be called in task context (i.e. from a workqueue). */ -STATIC void +STATIC int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1655,15 +1655,19 @@ xfs_end_io_direct_write( struct inode *inode = file_inode(iocb->ki_filp); struct xfs_ioend *ioend = private; + if (size <= 0) + return 0; + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, ioend ? ioend->io_type : 0, NULL); if (!ioend) { ASSERT(offset + size <= i_size_read(inode)); - return; + return 0; } __xfs_end_io_direct_write(inode, ioend, offset, size); + return 0; } static inline ssize_t @@ -1672,10 +1676,7 @@ xfs_vm_do_dio( struct kiocb *iocb, struct iov_iter *iter, loff_t offset, - void (*endio)(struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private), + dio_iodone_t endio, int flags) { struct block_device *bdev; -- cgit v1.2.3-55-g7522 From 273dda76f757108bc2b29d30a9595b6dd3bdf3a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:40:51 +1100 Subject: xfs: don't use ioends for direct write completions We only need to communicate two bits of information to the direct I/O completion handler: (1) do we need to convert any unwritten extents in the range (2) do we need to check if we need to update the inode size based on the range passed to the completion handler We can use the private data passed to the get_block handler and the completion handler as a simple bitmask to communicate this information instead of the current complicated infrastructure reusing the ioends from the buffer I/O path, and thus avoiding a memory allocation and a context switch for any non-trivial direct write. As a nice side effect we also decouple the direct I/O path implementation from that of the buffered I/O path. Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster --- fs/xfs/xfs_aops.c | 216 ++++++++++++++++++----------------------------------- fs/xfs/xfs_trace.h | 9 +-- 2 files changed, 75 insertions(+), 150 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 295aaffea78e..f008a4f9dcea 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -36,6 +36,10 @@ #include #include +/* flags for direct write completions */ +#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) +#define XFS_DIO_FLAG_APPEND (1 << 1) + void xfs_count_page_state( struct page *page, @@ -1238,27 +1242,8 @@ xfs_vm_releasepage( } /* - * When we map a DIO buffer, we may need to attach an ioend that describes the - * type of write IO we are doing. This passes to the completion function the - * operations it needs to perform. 
If the mapping is for an overwrite wholly - * within the EOF then we don't need an ioend and so we don't allocate one. - * This avoids the unnecessary overhead of allocating and freeing ioends for - * workloads that don't require transactions on IO completion. - * - * If we get multiple mappings in a single IO, we might be mapping different - * types. But because the direct IO can only have a single private pointer, we - * need to ensure that: - * - * a) i) the ioend spans the entire region of unwritten mappings; or - * ii) the ioend spans all the mappings that cross or are beyond EOF; and - * b) if it contains unwritten extents, it is *permanently* marked as such - * - * We could do this by chaining ioends like buffered IO does, but we only - * actually get one IO completion callback from the direct IO, and that spans - * the entire IO regardless of how many mappings and IOs are needed to complete - * the DIO. There is only going to be one reference to the ioend and its life - * cycle is constrained by the DIO completion code. hence we don't need - * reference counting here. + * When we map a DIO buffer, we may need to pass flags to + * xfs_end_io_direct_write to tell it what kind of write IO we are doing. * * Note that for DIO, an IO to the highest supported file block offset (i.e. * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 @@ -1266,68 +1251,26 @@ xfs_vm_releasepage( * extending the file size. We won't know for sure until IO completion is run * and the actual max write offset is communicated to the IO completion * routine. - * - * For DAX page faults, we are preparing to never see unwritten extents here, - * nor should we ever extend the inode size. Hence we will soon have nothing to - * do here for this case, ensuring we don't have to provide an IO completion - * callback to free an ioend that we don't actually need for a fault into the - * page at offset (2^63 - 1FSB) bytes. */ - static void xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool dax_fault) + xfs_off_t offset) { - struct xfs_ioend *ioend; + uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; - int type; - - if (ISUNWRITTEN(imap)) - type = XFS_IO_UNWRITTEN; - else - type = XFS_IO_OVERWRITE; - - trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); - - if (dax_fault) { - ASSERT(type == XFS_IO_OVERWRITE); - trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, - imap); - return; - } - if (bh_result->b_private) { - ioend = bh_result->b_private; - ASSERT(ioend->io_size > 0); - ASSERT(offset >= ioend->io_offset); - if (offset + size > ioend->io_offset + ioend->io_size) - ioend->io_size = offset - ioend->io_offset + size; - - if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) - ioend->io_type = XFS_IO_UNWRITTEN; - - trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, - ioend->io_size, ioend->io_type, - imap); - } else if (type == XFS_IO_UNWRITTEN || - offset + size > i_size_read(inode) || - offset + size < 0) { - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_size = size; + trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, + ISUNWRITTEN(imap) ? 
XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); - bh_result->b_private = ioend; + if (ISUNWRITTEN(imap)) { + *flags |= XFS_DIO_FLAG_UNWRITTEN; + set_buffer_defer_completion(bh_result); + } else if (offset + size > i_size_read(inode) || offset + size < 0) { + *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); - - trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, - imap); - } else { - trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, - imap); } } @@ -1498,9 +1441,12 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) - xfs_map_direct(inode, bh_result, &imap, offset, - dax_fault); + if (create && direct) { + if (dax_fault) + ASSERT(!ISUNWRITTEN(&imap)); + else + xfs_map_direct(inode, bh_result, &imap, offset); + } } /* @@ -1570,42 +1516,50 @@ xfs_get_blocks_dax_fault( return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); } -static void -__xfs_end_io_direct_write( - struct inode *inode, - struct xfs_ioend *ioend, +/* + * Complete a direct I/O write request. + * + * xfs_map_direct passes us some flags in the private data to tell us what to + * do. If no flags are set, then the write IO is an overwrite wholly within + * the existing allocated file size and so there is nothing for us to do. + * + * Note that in this case the completion can be called in interrupt context, + * whereas if we have flags set we will always be called in task context + * (i.e. from a workqueue). + */ +STATIC int +xfs_end_io_direct_write( + struct kiocb *iocb, loff_t offset, - ssize_t size) + ssize_t size, + void *private) { - struct xfs_mount *mp = XFS_I(inode)->i_mount; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + uintptr_t flags = (uintptr_t)private; + int error = 0; - if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) - goto out_end_io; + trace_xfs_end_io_direct_write(ip, offset, size); - /* - * dio completion end_io functions are only called on writes if more - * than 0 bytes was written. - */ - ASSERT(size > 0); + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; - /* - * The ioend only maps whole blocks, while the IO may be sector aligned. - * Hence the ioend offset/size may not match the IO offset/size exactly. - * Because we don't map overwrites within EOF into the ioend, the offset - * may not match, but only if the endio spans EOF. Either way, write - * the IO sizes into the ioend so that completion processing does the - * right thing. - */ - ASSERT(offset + size <= ioend->io_offset + ioend->io_size); - ioend->io_size = size; - ioend->io_offset = offset; + if (size <= 0) + return size; /* - * The ioend tells us whether we are doing unwritten extent conversion + * The flags tell us whether we are doing unwritten extent conversions * or an append transaction that updates the on-disk file size. These * cases are the only cases where we should *potentially* be needing * to update the VFS inode size. - * + */ + if (flags == 0) { + ASSERT(offset + size <= i_size_read(inode)); + return 0; + } + + /* * We need to update the in-core inode size here so that we don't end up * with the on-disk inode size being outside the in-core inode size. We * have no other method of updating EOF for AIO, so always do it here @@ -1616,58 +1570,30 @@ __xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. 
*/ - spin_lock(&XFS_I(inode)->i_flags_lock); + spin_lock(&ip->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&XFS_I(inode)->i_flags_lock); + spin_unlock(&ip->i_flags_lock); - /* - * If we are doing an append IO that needs to update the EOF on disk, - * do the transaction reserve now so we can use common end io - * processing. Stashing the error (if there is one) in the ioend will - * result in the ioend processing passing on the error if it is - * possible as we can't return it from here. - */ - if (ioend->io_type == XFS_IO_OVERWRITE) - ioend->io_error = xfs_setfilesize_trans_alloc(ioend); + if (flags & XFS_DIO_FLAG_UNWRITTEN) { + trace_xfs_end_io_direct_write_unwritten(ip, offset, size); -out_end_io: - xfs_end_io(&ioend->io_work); - return; -} + error = xfs_iomap_write_unwritten(ip, offset, size); + } else if (flags & XFS_DIO_FLAG_APPEND) { + struct xfs_trans *tp; -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_ioend *ioend = private; + trace_xfs_end_io_direct_write_append(ip, offset, size); - if (size <= 0) - return 0; - - trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, - ioend ? ioend->io_type : 0, NULL); - - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp); + return error; + } + error = xfs_setfilesize(ip, tp, offset, size); } - __xfs_end_io_direct_write(inode, ioend, offset, size); - return 0; + return error; } static inline ssize_t diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 391d797cb53f..c8d58426008e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1296,11 +1296,7 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); -DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); +DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1340,6 +1336,9 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); +DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), -- cgit v1.2.3-55-g7522 From c19b104a67b3bb1ac48275a8a1c9df666e676c25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Feb 2016 14:40:51 +1100 Subject: xfs: fold xfs_vm_do_dio into 
xfs_vm_direct_IO Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f008a4f9dcea..8163910bc49f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1596,38 +1596,30 @@ xfs_end_io_direct_write( return error; } -static inline ssize_t -xfs_vm_do_dio( - struct inode *inode, +STATIC ssize_t +xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter, - loff_t offset, - dio_iodone_t endio, - int flags) + loff_t offset) { + struct inode *inode = iocb->ki_filp->f_mapping->host; + dio_iodone_t *endio = NULL; + int flags = 0; struct block_device *bdev; - if (IS_DAX(inode)) + if (iov_iter_rw(iter) == WRITE) { + endio = xfs_end_io_direct_write; + flags = DIO_ASYNC_EXTEND; + } + + if (IS_DAX(inode)) { return dax_do_io(iocb, inode, iter, offset, xfs_get_blocks_direct, endio, 0); + } bdev = xfs_find_bdev_for_inode(inode); return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, endio, NULL, flags); -} - -STATIC ssize_t -xfs_vm_direct_IO( - struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset) -{ - struct inode *inode = iocb->ki_filp->f_mapping->host; - - if (iov_iter_rw(iter) == WRITE) - return xfs_vm_do_dio(inode, iocb, iter, offset, - xfs_end_io_direct_write, DIO_ASYNC_EXTEND); - return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); + xfs_get_blocks_direct, endio, NULL, flags); } /* -- cgit v1.2.3-55-g7522 From 60630fe66ed28d43379382645ed349f7d3457330 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 8 Feb 2016 15:00:02 +1100 Subject: xfs: clean up unwritten buffers on write failure The xfs_vm_write_failed() handler is currently responsible for cleaning up any delalloc blocks over the range of a failed write beyond EOF. Failure to do so results in warning messages and other inconsistencies between buffer and extent state. The ->releasepage() handler currently warns in the event of a page being released with either unwritten or delalloc buffers, as neither is ever expected by the time a page is released. As has been reproduced by generic/083 on a -bsize=1k fs, it is currently possible to trigger the ->releasepage() warning for a page with unwritten buffers when a filesystem is near ENOSPC. This is reproduced by the following sequence: $ mkfs.xfs -f -b size=1k -d size=100m $ mount /mnt/ $ $ xfs_io -fc "falloc -k 0 1k" /mnt/file $ dd if=/dev/zero of=/mnt/enospc conv=notrunc oflag=append $ $ xfs_io -c "pwrite 512 1k" /mnt/file $ xfs_io -d -c "pwrite 16k 1k" /mnt/file The first pwrite command attempts a block unaligned write across an unwritten block and a hole. The delalloc for the hole fails with ENOSPC and the subsequent error handling does not clean up the unwritten buffer that was instantiated during the first ->get_block() call. The second pwrite triggers a warning as part of the inode mapping invalidation that occurs prior to direct I/O. The releasepage() handler detects the unwritten buffer at this time, warns and prevents the release of the page. To deal with this problem, update xfs_vm_write_failed() to clean up unwritten as well as delalloc buffers that are beyond EOF and within the range of the failed write. 
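Read together, the hunks below amount to the following loop body for
xfs_vm_write_failed() (a consolidated sketch assembled from the diff,
with unrelated statements elided; not a verbatim quote of the patched
function):

	/*
	 * Process delalloc and unwritten buffers beyond EOF. Unwritten
	 * buffers can be left behind when an extending write fails.
	 */
	if (!buffer_delay(bh) && !buffer_unwritten(bh))
		continue;
	if (!buffer_new(bh) && block_offset < i_size_read(inode))
		continue;

	/* Only delalloc buffers have reserved blocks that need punching. */
	if (buffer_delay(bh))
		xfs_vm_kill_delalloc_range(inode, block_offset,
					   block_offset + bh->b_size);

	/* The buffer no longer holds valid data; reset its state. */
	clear_buffer_mapped(bh);
	clear_buffer_new(bh);
	clear_buffer_dirty(bh);
	clear_buffer_unwritten(bh);
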
Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..4a13f5311a41 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1783,14 +1783,22 @@ xfs_vm_write_failed( if (block_start >= to) break; - if (!buffer_delay(bh)) + /* + * Process delalloc and unwritten buffers beyond EOF. We can + * encounter unwritten buffers in the event that a file has + * post-EOF unwritten extents and an extending write happens to + * fail (e.g., an unaligned write that also involves a delalloc + * to the same page). + */ + if (!buffer_delay(bh) && !buffer_unwritten(bh)) continue; if (!buffer_new(bh) && block_offset < i_size_read(inode)) continue; - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); + if (buffer_delay(bh)) + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); /* * This buffer does not contain data anymore. make sure anyone @@ -1801,6 +1809,7 @@ xfs_vm_write_failed( clear_buffer_mapped(bh); clear_buffer_new(bh); clear_buffer_dirty(bh); + clear_buffer_unwritten(bh); } } -- cgit v1.2.3-55-g7522 From af055e37a91d215d7174d0b84c86795ca81086a7 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 8 Feb 2016 15:00:02 +1100 Subject: xfs: fix xfs_log_ticket leak in xfs_end_io() after fs shutdown If the filesystem has shut down, xfs_end_io() currently sets an error on the ioend and proceeds to ioend destruction. The ioend might contain a truncate transaction if the I/O extended the size of the file. This transaction is only cleaned up in xfs_setfilesize_ioend(), however, which is skipped in this case. This results in an xfs_log_ticket leak message when the associate cache slab is destroyed (e.g., on rmmod). This was originally reproduced by xfs/141 on a distro kernel. The problem is reproducible on an upstream kernel, but not easily detected in current upstream if the xfs_log_ticket cache happens to be merged with another cache. This can be reproduced more deterministically with the 'slab_nomerge' kernel boot option. Update xfs_end_io() to proceed with normal end I/O processing after an error is set on an ioend due to fs shutdown. The I/O type-based processing is already designed to handle an I/O error and ensure that the ioend is cleaned up correctly. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4a13f5311a41..0c8dacea411e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -214,10 +214,12 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + /* + * Set an error if the mount has shut down and proceed with end I/O + * processing so it can perform whatever cleanups are necessary. + */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) ioend->io_error = -EIO; - goto done; - } /* * For unwritten extents we need to issue transactions to convert a -- cgit v1.2.3-55-g7522 From 988ef927926aa3481cbf144f235c0cefd7deb9e4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:20:50 +1100 Subject: xfs: remove nonblocking mode from xfs_vm_writepage Remove the nonblocking optimisation done for mapping lookups during writeback. 
It's not clear that leaving a hole in the writeback range just because we couldn't get a lock is really a win, as it makes us do another small random IO later on rather than a large sequential IO now. As this gets in the way of sane error handling later on, just remove for the moment and we can re-introduce an equivalent optimisation in future if we see problems due to extent map lock contention. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..807744b19233 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -283,8 +283,7 @@ xfs_map_blocks( struct inode *inode, loff_t offset, struct xfs_bmbt_irec *imap, - int type, - int nonblocking) + int type) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -300,12 +299,7 @@ xfs_map_blocks( if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (nonblocking) - return -EAGAIN; - xfs_ilock(ip, XFS_ILOCK_SHARED); - } - + xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); ASSERT(offset <= mp->m_super->s_maxbytes); @@ -955,7 +949,6 @@ xfs_vm_writepage( ssize_t len; int err, imap_valid = 0, uptodate = 1; int count = 0; - int nonblocking = 0; trace_xfs_writepage(inode, page, 0, 0); @@ -1055,9 +1048,6 @@ xfs_vm_writepage( offset = page_offset(page); type = XFS_IO_OVERWRITE; - if (wbc->sync_mode == WB_SYNC_NONE) - nonblocking = 1; - do { int new_ioend = 0; @@ -1117,8 +1107,7 @@ xfs_vm_writepage( * time. */ new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type, - nonblocking); + err = xfs_map_blocks(inode, offset, &imap, type); if (err) goto error; imap_valid = xfs_imap_valid(inode, &imap, offset); @@ -1188,9 +1177,6 @@ error: if (iohead) xfs_cancel_ioend(iohead); - if (err == -EAGAIN) - goto redirty; - xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); -- cgit v1.2.3-55-g7522 From 150d5be09ce49a9bed6feb7b7dc4e5ae188778ec Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:12 +1100 Subject: xfs: remove xfs_cancel_ioend We currently have code to cancel ioends being built because we change bufferhead state as we build the ioend. On error, this needs to be unwound and so we have cancelling code that walks the buffers on the ioend chain and undoes these state changes. However, the IO submission path already handles state changes for buffers when a submission error occurs, so we don't really need a separate cancel function to do this - we can simply submit the ioend chain with the specific error and it will be cancelled rather than submitted. Hence we can remove the explicit cancel code and just rely on submission to deal with the error correctly. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 93 +++++++++++++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 47 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 807744b19233..245c448a2448 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -521,38 +521,6 @@ xfs_submit_ioend( } while ((ioend = next) != NULL); } -/* - * Cancel submission of all buffer_heads so far in this endio. - * Toss the endio too. 
Only ever called for the initial page - * in a writepage request, so only ever one page. - */ -STATIC void -xfs_cancel_ioend( - xfs_ioend_t *ioend) -{ - xfs_ioend_t *next; - struct buffer_head *bh, *next_bh; - - do { - next = ioend->io_list; - bh = ioend->io_buffer_head; - do { - next_bh = bh->b_private; - clear_buffer_async_write(bh); - /* - * The unwritten flag is cleared when added to the - * ioend. We're not submitting for I/O so mark the - * buffer unwritten again for next time around. - */ - if (ioend->io_type == XFS_IO_UNWRITTEN) - set_buffer_unwritten(bh); - unlock_buffer(bh); - } while ((bh = next_bh) != NULL); - - mempool_free(ioend, xfs_ioend_pool); - } while ((ioend = next) != NULL); -} - /* * Test to see if we've been building up a completion structure for * earlier buffers -- if so, we try to append to this ioend if we @@ -925,6 +893,28 @@ out_invalidate: return; } +static int +xfs_writepage_submit( + struct xfs_ioend *ioend, + struct xfs_ioend *iohead, + struct writeback_control *wbc, + int status) +{ + struct blk_plug plug; + + /* Reserve log space if we might write beyond the on-disk inode size. */ + if (!status && ioend && ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(ioend)) + status = xfs_setfilesize_trans_alloc(ioend); + + if (iohead) { + blk_start_plug(&plug); + xfs_submit_ioend(wbc, iohead, status); + blk_finish_plug(&plug); + } + return status; +} + /* * Write out a dirty page. * @@ -1136,6 +1126,7 @@ xfs_vm_writepage( return 0; ASSERT(iohead); + ASSERT(err == 0); /* * Any errors from this point onwards need tobe reported through the IO @@ -1161,25 +1152,33 @@ xfs_vm_writepage( wbc, end_index); } + return xfs_writepage_submit(ioend, iohead, wbc, 0); +error: /* - * Reserve log space if we might write beyond the on-disk inode size. + * On error, we have to fail the iohead here because we buffers locked + * in the ioend chain. If we don't do this, we'll deadlock invalidating + * the page as that tries to lock the buffers on the page. Also, because + * we may have set pages under writeback, we have to run IO completion to + * mark the error state of the IO appropriately, so we can't cancel the + * ioend directly here. That means we have to mark this page as under + * writeback if we included any buffers from it in the ioend chain. */ - err = 0; - if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) - err = xfs_setfilesize_trans_alloc(ioend); - - xfs_submit_ioend(wbc, iohead, err); + if (count) + xfs_start_page_writeback(page, 0, count); + xfs_writepage_submit(ioend, iohead, wbc, err); - return 0; - -error: - if (iohead) - xfs_cancel_ioend(iohead); - - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); + /* + * We can only discard the page we had the IO error on if we haven't + * included it in the ioend above. If it has already been errored out, + * the it is unlocked and we can't touch it here. + */ + if (!count) { + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + } + mapping_set_error(page->mapping, err); return err; redirty: -- cgit v1.2.3-55-g7522 From fbcc025613590d7b1d15521555dcc6393a148a6b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:19 +1100 Subject: xfs: Introduce writeback context for writepages xfs_vm_writepages() calls generic_writepages to writeback a range of a file, but then xfs_vm_writepage() clusters pages itself as it does not have any context it can pass between->writepage calls from __write_cache_pages(). 
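In miniature, the fix is the pattern below: the new xfs_vm_writepages()
as it appears in the diff further down, extracted here for readability
with comments added. Keeping the context on the caller's stack means no
allocation is needed and the state dies with the writeback invocation.

	STATIC int
	xfs_vm_writepages(
		struct address_space	*mapping,
		struct writeback_control *wbc)
	{
		/* One context spans every per-page call for this range. */
		struct xfs_writepage_ctx wpc = {
			.io_type = XFS_IO_INVALID,
		};
		int			ret;

		xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
		/* write_cache_pages() hands &wpc to each xfs_do_writepage() call. */
		ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
		/* Submit whatever ioend chain the context accumulated. */
		return xfs_writepage_submit(&wpc, wbc, ret);
	}
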
Introduce a writeback context for xfs_vm_writepages() and call __write_cache_pages directly with our own writepage callback so that we can pass that context to each writepage invocation. This encapsulates the current mapping, whether it is valid or not, the current ioend and it's IO type and the ioend chain being built. This requires us to move the ioend submission up to the level where the writepage context is declared. This does mean we do not submit IO until we packaged the entire writeback range, but with the block plugging in the writepages call this is the way IO is submitted, anyway. It also means that we need to handle discontiguous page ranges. If the pages sent down by write_cache_pages to the writepage callback are discontiguous, we need to detect this and put each discontiguous page range into individual ioends. This is needed to ensure that the ioend accurately represents the range of the file that it covers so that file size updates during IO completion set the size correctly. Failure to take into account the discontiguous ranges results in files being too small when writeback patterns are non-sequential. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 220 ++++++++++++++++++++++++++++-------------------------- fs/xfs/xfs_aops.h | 2 + 2 files changed, 117 insertions(+), 105 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 245c448a2448..e0be34125cf0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -36,6 +36,18 @@ #include #include +/* + * structure owned by writepages passed to individual writepage calls + */ +struct xfs_writepage_ctx { + struct xfs_bmbt_irec imap; + bool imap_valid; + unsigned int io_type; + struct xfs_ioend *iohead; + struct xfs_ioend *ioend; + sector_t last_block; +}; + void xfs_count_page_state( struct page *page, @@ -335,7 +347,7 @@ xfs_map_blocks( return 0; } -STATIC int +STATIC bool xfs_imap_valid( struct inode *inode, struct xfs_bmbt_irec *imap, @@ -532,29 +544,27 @@ xfs_add_to_ioend( struct inode *inode, struct buffer_head *bh, xfs_off_t offset, - unsigned int type, - xfs_ioend_t **result, - int need_ioend) + struct xfs_writepage_ctx *wpc) { - xfs_ioend_t *ioend = *result; - - if (!ioend || need_ioend || type != ioend->io_type) { - xfs_ioend_t *previous = *result; - - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_buffer_head = bh; - ioend->io_buffer_tail = bh; - if (previous) - previous->io_list = ioend; - *result = ioend; + if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || + bh->b_blocknr != wpc->last_block + 1) { + struct xfs_ioend *new; + + new = xfs_alloc_ioend(inode, wpc->io_type); + new->io_offset = offset; + new->io_buffer_head = bh; + new->io_buffer_tail = bh; + if (wpc->ioend) + wpc->ioend->io_list = new; + wpc->ioend = new; } else { - ioend->io_buffer_tail->b_private = bh; - ioend->io_buffer_tail = bh; + wpc->ioend->io_buffer_tail->b_private = bh; + wpc->ioend->io_buffer_tail = bh; } bh->b_private = NULL; - ioend->io_size += bh->b_size; + wpc->ioend->io_size += bh->b_size; + wpc->last_block = bh->b_blocknr; } STATIC void @@ -651,17 +661,15 @@ xfs_convert_page( struct inode *inode, struct page *page, loff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, + struct xfs_writepage_ctx *wpc, struct writeback_control *wbc) { struct buffer_head *bh, *head; xfs_off_t end_offset; unsigned long p_offset; - unsigned int type; int len, page_dirty; int count = 0, done = 0, 
uptodate = 1; - xfs_off_t offset = page_offset(page); + xfs_off_t offset = page_offset(page); if (page->index != tindex) goto fail; @@ -671,7 +679,7 @@ xfs_convert_page( goto fail_unlock_page; if (page->mapping != inode->i_mapping) goto fail_unlock_page; - if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) + if (!xfs_check_page_type(page, wpc->ioend->io_type, false)) goto fail_unlock_page; /* @@ -707,7 +715,7 @@ xfs_convert_page( * writeback. Hence for more optimal IO patterns, we should always * avoid partial page writeback due to multiple mappings on a page here. */ - if (!xfs_imap_valid(inode, imap, end_offset)) + if (!xfs_imap_valid(inode, &wpc->imap, end_offset)) goto fail_unlock_page; len = 1 << inode->i_blkbits; @@ -739,23 +747,22 @@ xfs_convert_page( if (buffer_unwritten(bh) || buffer_delay(bh) || buffer_mapped(bh)) { if (buffer_unwritten(bh)) - type = XFS_IO_UNWRITTEN; + wpc->io_type = XFS_IO_UNWRITTEN; else if (buffer_delay(bh)) - type = XFS_IO_DELALLOC; + wpc->io_type = XFS_IO_DELALLOC; else - type = XFS_IO_OVERWRITE; + wpc->io_type = XFS_IO_OVERWRITE; /* * imap should always be valid because of the above * partial page end_offset check on the imap. */ - ASSERT(xfs_imap_valid(inode, imap, offset)); + ASSERT(xfs_imap_valid(inode, &wpc->imap, offset)); lock_buffer(bh); - if (type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - ioendp, done); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc); page_dirty--; count++; @@ -790,8 +797,7 @@ STATIC void xfs_cluster_write( struct inode *inode, pgoff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, + struct xfs_writepage_ctx *wpc, struct writeback_control *wbc, pgoff_t tlast) { @@ -807,7 +813,7 @@ xfs_cluster_write( for (i = 0; i < pagevec_count(&pvec); i++) { done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc); + wpc, wbc); if (done) break; } @@ -895,21 +901,20 @@ out_invalidate: static int xfs_writepage_submit( - struct xfs_ioend *ioend, - struct xfs_ioend *iohead, + struct xfs_writepage_ctx *wpc, struct writeback_control *wbc, int status) { struct blk_plug plug; /* Reserve log space if we might write beyond the on-disk inode size. */ - if (!status && ioend && ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(ioend)) - status = xfs_setfilesize_trans_alloc(ioend); + if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(wpc->ioend)) + status = xfs_setfilesize_trans_alloc(wpc->ioend); - if (iohead) { + if (wpc->iohead) { blk_start_plug(&plug); - xfs_submit_ioend(wbc, iohead, status); + xfs_submit_ioend(wbc, wpc->iohead, status); blk_finish_plug(&plug); } return status; @@ -924,20 +929,19 @@ xfs_writepage_submit( * For any other dirty buffer heads on the page we should flush them. 
*/ STATIC int -xfs_vm_writepage( +xfs_do_writepage( struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + void *data) { + struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; struct buffer_head *bh, *head; - struct xfs_bmbt_irec imap; - xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; - unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; ssize_t len; - int err, imap_valid = 0, uptodate = 1; + int err, uptodate = 1; int count = 0; trace_xfs_writepage(inode, page, 0, 0); @@ -1036,11 +1040,8 @@ xfs_vm_writepage( bh = head = page_buffers(page); offset = page_offset(page); - type = XFS_IO_OVERWRITE; do { - int new_ioend = 0; - if (offset >= end_offset) break; if (!buffer_uptodate(bh)) @@ -1053,24 +1054,24 @@ xfs_vm_writepage( * buffers covering holes here. */ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - imap_valid = 0; + wpc->imap_valid = false; continue; } if (buffer_unwritten(bh)) { - if (type != XFS_IO_UNWRITTEN) { - type = XFS_IO_UNWRITTEN; - imap_valid = 0; + if (wpc->io_type != XFS_IO_UNWRITTEN) { + wpc->io_type = XFS_IO_UNWRITTEN; + wpc->imap_valid = false; } } else if (buffer_delay(bh)) { - if (type != XFS_IO_DELALLOC) { - type = XFS_IO_DELALLOC; - imap_valid = 0; + if (wpc->io_type != XFS_IO_DELALLOC) { + wpc->io_type = XFS_IO_DELALLOC; + wpc->imap_valid = false; } } else if (buffer_uptodate(bh)) { - if (type != XFS_IO_OVERWRITE) { - type = XFS_IO_OVERWRITE; - imap_valid = 0; + if (wpc->io_type != XFS_IO_OVERWRITE) { + wpc->io_type = XFS_IO_OVERWRITE; + wpc->imap_valid = false; } } else { if (PageUptodate(page)) @@ -1081,38 +1082,29 @@ xfs_vm_writepage( * subsequent writeable buffers into a new * ioend. */ - imap_valid = 0; + wpc->imap_valid = 0; continue; } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - if (!imap_valid) { - /* - * If we didn't have a valid mapping then we need to - * put the new mapping into a separate ioend structure. - * This ensures non-contiguous extents always have - * separate ioends, which is particularly important - * for unwritten extent conversion at I/O completion - * time. - */ - new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type); + if (wpc->imap_valid) + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); + if (!wpc->imap_valid) { + err = xfs_map_blocks(inode, offset, &wpc->imap, + wpc->io_type); if (err) goto error; - imap_valid = xfs_imap_valid(inode, &imap, offset); + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); } - if (imap_valid) { + if (wpc->imap_valid) { lock_buffer(bh); - if (type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, &ioend, - new_ioend); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc); count++; } - if (!iohead) - iohead = ioend; + if (!wpc->iohead) + wpc->iohead = wpc->ioend; } while (offset += len, ((bh = bh->b_this_page) != head)); @@ -1122,10 +1114,10 @@ xfs_vm_writepage( xfs_start_page_writeback(page, 1, count); /* if there is no IO to be submitted for this page, we are done */ - if (!ioend) + if (!count) return 0; - ASSERT(iohead); + ASSERT(wpc->iohead); ASSERT(err == 0); /* @@ -1133,10 +1125,10 @@ xfs_vm_writepage( * completion path as we have marked the initial page as under writeback * and unlocked it. 
*/ - if (imap_valid) { + if (wpc->imap_valid) { xfs_off_t end_index; - end_index = imap.br_startoff + imap.br_blockcount; + end_index = wpc->imap.br_startoff + wpc->imap.br_blockcount; /* to bytes */ end_index <<= inode->i_blkbits; @@ -1148,32 +1140,30 @@ xfs_vm_writepage( if (end_index > last_index) end_index = last_index; - xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, end_index); + xfs_cluster_write(inode, page->index + 1, wpc, wbc, end_index); } - - return xfs_writepage_submit(ioend, iohead, wbc, 0); + return 0; error: /* * On error, we have to fail the iohead here because we buffers locked * in the ioend chain. If we don't do this, we'll deadlock invalidating * the page as that tries to lock the buffers on the page. Also, because - * we may have set pages under writeback, we have to run IO completion to - * mark the error state of the IO appropriately, so we can't cancel the - * ioend directly here. That means we have to mark this page as under - * writeback if we included any buffers from it in the ioend chain. + * we may have set pages under writeback, we have to make sure we run + * IO completion to mark the error state of the IO appropriately, so we + * can't cancel the ioend directly here. That means we have to mark this + * page as under writeback if we included any buffers from it in the + * ioend chain so that completion treats it correctly. + * + * If we didn't include the page in the ioend, then we can simply + * discard and unlock it as there are no other users of the page or it's + * buffers right now. The caller will still need to trigger submission + * of outstanding ioends on the writepage context so they are treated + * correctly on error. */ if (count) xfs_start_page_writeback(page, 0, count); - xfs_writepage_submit(ioend, iohead, wbc, err); - - /* - * We can only discard the page we had the IO error on if we haven't - * included it in the ioend above. If it has already been errored out, - * the it is unlocked and we can't touch it here. - */ - if (!count) { + else { xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); @@ -1187,13 +1177,33 @@ redirty: return 0; } +STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct xfs_writepage_ctx wpc = { + .io_type = XFS_IO_INVALID, + }; + int ret; + + ret = xfs_do_writepage(page, wbc, &wpc); + return xfs_writepage_submit(&wpc, wbc, ret); +} + STATIC int xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { + struct xfs_writepage_ctx wpc = { + .io_type = XFS_IO_INVALID, + }; + int ret; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); + return xfs_writepage_submit(&wpc, wbc, ret); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f6ffc9ae5ceb..3c3f1a37a1c7 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,12 +24,14 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. 
*/ enum { + XFS_IO_INVALID, /* initial state */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ + { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ { XFS_IO_OVERWRITE, "overwrite" } -- cgit v1.2.3-55-g7522 From ad68972acb82c3e8bba316d542ab204984cb1f1c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:31 +1100 Subject: xfs: xfs_cluster_write is redundant xfs_cluster_write() is not necessary now that xfs_vm_writepages() aggregates writepage calls across a single mapping. This means we no longer need to do page lookups in xfs_cluster_write, so writeback only needs to look up th epage cache once per page being written. This also removes a large amount of mostly duplicate code between xfs_do_writepage() and xfs_convert_page(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 214 ++---------------------------------------------------- 1 file changed, 6 insertions(+), 208 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e0be34125cf0..f75c7c99cb63 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -650,179 +650,6 @@ xfs_check_page_type( return false; } -/* - * Allocate & map buffers for page given the extent map. Write it out. - * except for the original page of a writepage, this is called on - * delalloc/unwritten pages only, for the original page it is possible - * that the page has no mapping at all. - */ -STATIC int -xfs_convert_page( - struct inode *inode, - struct page *page, - loff_t tindex, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc) -{ - struct buffer_head *bh, *head; - xfs_off_t end_offset; - unsigned long p_offset; - int len, page_dirty; - int count = 0, done = 0, uptodate = 1; - xfs_off_t offset = page_offset(page); - - if (page->index != tindex) - goto fail; - if (!trylock_page(page)) - goto fail; - if (PageWriteback(page)) - goto fail_unlock_page; - if (page->mapping != inode->i_mapping) - goto fail_unlock_page; - if (!xfs_check_page_type(page, wpc->ioend->io_type, false)) - goto fail_unlock_page; - - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - i_size_read(inode)); - - /* - * If the current map does not span the entire page we are about to try - * to write, then give up. The only way we can write a page that spans - * multiple mappings in a single writeback iteration is via the - * xfs_vm_writepage() function. Data integrity writeback requires the - * entire page to be written in a single attempt, otherwise the part of - * the page we don't write here doesn't get written as part of the data - * integrity sync. 
- * - * For normal writeback, we also don't attempt to write partial pages - * here as it simply means that write_cache_pages() will see it under - * writeback and ignore the page until some point in the future, at - * which time this will be the only page in the file that needs - * writeback. Hence for more optimal IO patterns, we should always - * avoid partial page writeback due to multiple mappings on a page here. - */ - if (!xfs_imap_valid(inode, &wpc->imap, end_offset)) - goto fail_unlock_page; - - len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; - - /* - * The moment we find a buffer that doesn't match our current type - * specification or can't be written, abort the loop and start - * writeback. As per the above xfs_imap_valid() check, only - * xfs_vm_writepage() can handle partial page writeback fully - we are - * limited here to the buffers that are contiguous with the current - * ioend, and hence a buffer we can't write breaks that contiguity and - * we have to defer the rest of the IO to xfs_vm_writepage(). - */ - bh = head = page_buffers(page); - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh))) { - done = 1; - break; - } - - if (buffer_unwritten(bh) || buffer_delay(bh) || - buffer_mapped(bh)) { - if (buffer_unwritten(bh)) - wpc->io_type = XFS_IO_UNWRITTEN; - else if (buffer_delay(bh)) - wpc->io_type = XFS_IO_DELALLOC; - else - wpc->io_type = XFS_IO_OVERWRITE; - - /* - * imap should always be valid because of the above - * partial page end_offset check on the imap. - */ - ASSERT(xfs_imap_valid(inode, &wpc->imap, offset)); - - lock_buffer(bh); - if (wpc->io_type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc); - - page_dirty--; - count++; - } else { - done = 1; - break; - } - } while (offset += len, (bh = bh->b_this_page) != head); - - if (uptodate && bh == head) - SetPageUptodate(page); - - if (count) { - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); - - return done; - fail_unlock_page: - unlock_page(page); - fail: - return 1; -} - -/* - * Convert & write out a cluster of pages in the same extent as defined - * by mp and following the start page. - */ -STATIC void -xfs_cluster_write( - struct inode *inode, - pgoff_t tindex, - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - pgoff_t tlast) -{ - struct pagevec pvec; - int done = 0, i; - - pagevec_init(&pvec, 0); - while (!done && tindex <= tlast) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - done = xfs_convert_page(inode, pvec.pages[i], tindex++, - wpc, wbc); - if (done) - break; - } - - pagevec_release(&pvec); - cond_resched(); - } -} - STATIC void xfs_vm_invalidatepage( struct page *page, @@ -939,7 +766,7 @@ xfs_do_writepage( struct buffer_head *bh, *head; loff_t offset; __uint64_t end_offset; - pgoff_t end_index, last_index; + pgoff_t end_index; ssize_t len; int err, uptodate = 1; int count = 0; @@ -969,12 +796,9 @@ xfs_do_writepage( if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) goto redirty; - /* Is this page beyond the end of the file? 
*/ - offset = i_size_read(inode); - end_index = offset >> PAGE_CACHE_SHIFT; - last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - /* + * Is this page beyond the end of the file? + * * The page index is less than the end_index, adjust the end_offset * to the highest offset that this page should represent. * ----------------------------------------------------- @@ -985,6 +809,8 @@ xfs_do_writepage( * | desired writeback range | see else | * ---------------------------------^------------------| */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; if (page->index < end_index) end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; else { @@ -1113,35 +939,7 @@ xfs_do_writepage( xfs_start_page_writeback(page, 1, count); - /* if there is no IO to be submitted for this page, we are done */ - if (!count) - return 0; - - ASSERT(wpc->iohead); - ASSERT(err == 0); - - /* - * Any errors from this point onwards need tobe reported through the IO - * completion path as we have marked the initial page as under writeback - * and unlocked it. - */ - if (wpc->imap_valid) { - xfs_off_t end_index; - - end_index = wpc->imap.br_startoff + wpc->imap.br_blockcount; - - /* to bytes */ - end_index <<= inode->i_blkbits; - - /* to pages */ - end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; - - /* check against file size */ - if (end_index > last_index) - end_index = last_index; - - xfs_cluster_write(inode, page->index + 1, wpc, wbc, end_index); - } + ASSERT(wpc->iohead || !count); return 0; error: -- cgit v1.2.3-55-g7522 From bfce7d2e2d5ee05e9d465888905c66a70a9c243c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:21:37 +1100 Subject: xfs: factor mapping out of xfs_do_writepage Separate out the bufferhead based mapping from the writepage code so that we have a clear separation of the page operations and the bufferhead state. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 232 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 122 insertions(+), 110 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f75c7c99cb63..6f5c95f94add 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -747,6 +747,127 @@ xfs_writepage_submit( return status; } +static int +xfs_writepage_map( + struct xfs_writepage_ctx *wpc, + struct inode *inode, + struct page *page, + loff_t offset, + __uint64_t end_offset) +{ + struct buffer_head *bh, *head; + ssize_t len = 1 << inode->i_blkbits; + int error = 0; + int uptodate = 1; + int count = 0; + + bh = head = page_buffers(page); + offset = page_offset(page); + + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. 
+ */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + wpc->imap_valid = false; + continue; + } + + if (buffer_unwritten(bh)) { + if (wpc->io_type != XFS_IO_UNWRITTEN) { + wpc->io_type = XFS_IO_UNWRITTEN; + wpc->imap_valid = false; + } + } else if (buffer_delay(bh)) { + if (wpc->io_type != XFS_IO_DELALLOC) { + wpc->io_type = XFS_IO_DELALLOC; + wpc->imap_valid = false; + } + } else if (buffer_uptodate(bh)) { + if (wpc->io_type != XFS_IO_OVERWRITE) { + wpc->io_type = XFS_IO_OVERWRITE; + wpc->imap_valid = false; + } + } else { + if (PageUptodate(page)) + ASSERT(buffer_mapped(bh)); + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + wpc->imap_valid = false; + continue; + } + + if (wpc->imap_valid) + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, + offset); + if (!wpc->imap_valid) { + error = xfs_map_blocks(inode, offset, &wpc->imap, + wpc->io_type); + if (error) + goto out_error; + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, + offset); + } + if (wpc->imap_valid) { + lock_buffer(bh); + if (wpc->io_type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &wpc->imap, offset); + xfs_add_to_ioend(inode, bh, offset, wpc); + count++; + } + + if (!wpc->iohead) + wpc->iohead = wpc->ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + xfs_start_page_writeback(page, 1, count); + ASSERT(wpc->iohead || !count); + return 0; + +out_error: + /* + * On error, we have to fail the iohead here because we locked buffers + * in the ioend chain. If we don't do this, we'll deadlock invalidating + * the page as that tries to lock the buffers on the page. Also, because + * we may have set pages under writeback, we have to make sure we run + * IO completion to mark the error state of the IO appropriately, so we + * can't cancel the ioend directly here. That means we have to mark this + * page as under writeback if we included any buffers from it in the + * ioend chain so that completion treats it correctly. + * + * If we didn't include the page in the ioend, then we can simply + * discard and unlock it as there are no other users of the page or it's + * buffers right now. The caller will still need to trigger submission + * of outstanding ioends on the writepage context so they are treated + * correctly on error. + */ + if (count) + xfs_start_page_writeback(page, 0, count); + else { + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + } + mapping_set_error(page->mapping, error); + return error; +} + /* * Write out a dirty page. * @@ -763,13 +884,9 @@ xfs_do_writepage( { struct xfs_writepage_ctx *wpc = data; struct inode *inode = page->mapping->host; - struct buffer_head *bh, *head; loff_t offset; __uint64_t end_offset; pgoff_t end_index; - ssize_t len; - int err, uptodate = 1; - int count = 0; trace_xfs_writepage(inode, page, 0, 0); @@ -862,112 +979,7 @@ xfs_do_writepage( end_offset = offset; } - len = 1 << inode->i_blkbits; - - bh = head = page_buffers(page); - offset = page_offset(page); - - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - - /* - * set_page_dirty dirties all buffers in a page, independent - * of their state. The dirty state however is entirely - * meaningless for holes (!mapped && uptodate), so skip - * buffers covering holes here. 
- */ - if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - wpc->imap_valid = false; - continue; - } - - if (buffer_unwritten(bh)) { - if (wpc->io_type != XFS_IO_UNWRITTEN) { - wpc->io_type = XFS_IO_UNWRITTEN; - wpc->imap_valid = false; - } - } else if (buffer_delay(bh)) { - if (wpc->io_type != XFS_IO_DELALLOC) { - wpc->io_type = XFS_IO_DELALLOC; - wpc->imap_valid = false; - } - } else if (buffer_uptodate(bh)) { - if (wpc->io_type != XFS_IO_OVERWRITE) { - wpc->io_type = XFS_IO_OVERWRITE; - wpc->imap_valid = false; - } - } else { - if (PageUptodate(page)) - ASSERT(buffer_mapped(bh)); - /* - * This buffer is not uptodate and will not be - * written to disk. Ensure that we will put any - * subsequent writeable buffers into a new - * ioend. - */ - wpc->imap_valid = 0; - continue; - } - - if (wpc->imap_valid) - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); - if (!wpc->imap_valid) { - err = xfs_map_blocks(inode, offset, &wpc->imap, - wpc->io_type); - if (err) - goto error; - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); - } - if (wpc->imap_valid) { - lock_buffer(bh); - if (wpc->io_type != XFS_IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc); - count++; - } - - if (!wpc->iohead) - wpc->iohead = wpc->ioend; - - } while (offset += len, ((bh = bh->b_this_page) != head)); - - if (uptodate && bh == head) - SetPageUptodate(page); - - xfs_start_page_writeback(page, 1, count); - - ASSERT(wpc->iohead || !count); - return 0; - -error: - /* - * On error, we have to fail the iohead here because we buffers locked - * in the ioend chain. If we don't do this, we'll deadlock invalidating - * the page as that tries to lock the buffers on the page. Also, because - * we may have set pages under writeback, we have to make sure we run - * IO completion to mark the error state of the IO appropriately, so we - * can't cancel the ioend directly here. That means we have to mark this - * page as under writeback if we included any buffers from it in the - * ioend chain so that completion treats it correctly. - * - * If we didn't include the page in the ioend, then we can simply - * discard and unlock it as there are no other users of the page or it's - * buffers right now. The caller will still need to trigger submission - * of outstanding ioends on the writepage context so they are treated - * correctly on error. - */ - if (count) - xfs_start_page_writeback(page, 0, count); - else { - xfs_aops_discard_page(page); - ClearPageUptodate(page); - unlock_page(page); - } - mapping_set_error(page->mapping, err); - return err; + return xfs_writepage_map(wpc, inode, page, offset, end_offset); redirty: redirty_page_for_writepage(wbc, page); -- cgit v1.2.3-55-g7522 From e10de3723c53378e7cf441529f563c316fdc0dd3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 15 Feb 2016 17:23:12 +1100 Subject: xfs: don't chain ioends during writepage submission Currently we can build a long ioend chain during ->writepages that gets attached to the writepage context. IO submission only then occurs when we finish all the writepage processing. This means we can have many ioends allocated and pending, and this violates the mempool guarantees that we need to give about forwards progress. i.e. we really should only have one ioend being built at a time, otherwise we may drain the mempool trying to allocate a new ioend and that blocks submission, completion and freeing of ioends that are already in progress. 
To prevent this situation from happening, we need to submit ioends for
IO as soon as they are ready for dispatch rather than queuing them for
later submission. This means the ioends have bios built immediately and
they get queued on any plug that is currently active. Hence if we
schedule away from writeback, the ioends that have been built will make
forwards progress due to the plug flushing on context switch. This will
also prevent context switches from creating unnecessary IO submission
latency.

We can't completely avoid having nested IO allocation - when we have a
block size smaller than a page size, we still need to hold the ioend
submission until after we have marked the current page dirty. Hence we
may need multiple ioends to be held while the current page is
completely mapped and made ready for IO dispatch. We cannot avoid this
problem - the current code already has this ioend chaining within a
page so we can mostly ignore that it occurs.

Signed-off-by: Dave Chinner
Reviewed-by: Christoph Hellwig
Signed-off-by: Dave Chinner
---
 fs/xfs/xfs_aops.c | 241 ++++++++++++++++++++++++++----------------------------
 fs/xfs/xfs_aops.h |   2 +-
 2 files changed, 119 insertions(+), 124 deletions(-)

(limited to 'fs/xfs/xfs_aops.c')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6f5c95f94add..46dc9211bae7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -43,7 +43,6 @@ struct xfs_writepage_ctx {
 	struct xfs_bmbt_irec	imap;
 	bool			imap_valid;
 	unsigned int		io_type;
-	struct xfs_ioend	*iohead;
 	struct xfs_ioend	*ioend;
 	sector_t		last_block;
 };
@@ -277,7 +276,7 @@ xfs_alloc_ioend(
 	 */
 	atomic_set(&ioend->io_remaining, 1);
 	ioend->io_error = 0;
-	ioend->io_list = NULL;
+	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = type;
 	ioend->io_inode = inode;
 	ioend->io_buffer_head = NULL;
@@ -420,8 +419,7 @@ xfs_start_buffer_writeback(
 STATIC void
 xfs_start_page_writeback(
 	struct page		*page,
-	int			clear_dirty,
-	int			buffers)
+	int			clear_dirty)
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
@@ -440,10 +438,6 @@ xfs_start_page_writeback(
 		set_page_writeback_keepwrite(page);
 
 	unlock_page(page);
-
-	/* If no buffers on the page are to be written, finish it here */
-	if (!buffers)
-		end_page_writeback(page);
 }
 
 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
@@ -452,110 +446,90 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 }
 
 /*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
+ * Submit all of the bios for an ioend. We are only passed a single ioend at a
+ * time; the caller is responsible for chaining prior to submission.
* * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback * and unlocked them. In this situation, we need to fail the ioend chain rather * than submit it to IO. This typically only happens on a filesystem shutdown. */ -STATIC void +STATIC int xfs_submit_ioend( struct writeback_control *wbc, xfs_ioend_t *ioend, - int fail) + int status) { - xfs_ioend_t *head = ioend; - xfs_ioend_t *next; struct buffer_head *bh; struct bio *bio; sector_t lastblock = 0; - /* Pass 1 - start writeback */ - do { - next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) - xfs_start_buffer_writeback(bh); - } while ((ioend = next) != NULL); + /* Reserve log space if we might write beyond the on-disk inode size. */ + if (!status && + ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + status = xfs_setfilesize_trans_alloc(ioend); + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (status) { + ioend->io_error = status; + xfs_finish_ioend(ioend); + return status; + } - /* Pass 2 - submit I/O */ - ioend = head; - do { - next = ioend->io_list; - bio = NULL; + bio = NULL; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - /* - * If we are failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - if (fail) { - ioend->io_error = fail; - xfs_finish_ioend(ioend); - continue; + if (!bio) { +retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; } - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - - if (!bio) { - retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } - - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } - - lastblock = bh->b_blocknr; - } - if (bio) + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend); - } while ((ioend = next) != NULL); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + return 0; } /* * Test to see if we've been building up a completion structure for * earlier buffers -- if so, we try to append to this ioend if we * can, otherwise we finish off any current ioend and start another. - * Return true if we've finished the given ioend. + * Return the ioend we finished off so that the caller can submit it + * once it has finished processing the dirty page. 
*/ STATIC void xfs_add_to_ioend( struct inode *inode, struct buffer_head *bh, xfs_off_t offset, - struct xfs_writepage_ctx *wpc) + struct xfs_writepage_ctx *wpc, + struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || bh->b_blocknr != wpc->last_block + 1) { struct xfs_ioend *new; + if (wpc->ioend) + list_add(&wpc->ioend->io_list, iolist); + new = xfs_alloc_ioend(inode, wpc->io_type); new->io_offset = offset; new->io_buffer_head = bh; new->io_buffer_tail = bh; - if (wpc->ioend) - wpc->ioend->io_list = new; wpc->ioend = new; } else { wpc->ioend->io_buffer_tail->b_private = bh; @@ -565,6 +539,7 @@ xfs_add_to_ioend( bh->b_private = NULL; wpc->ioend->io_size += bh->b_size; wpc->last_block = bh->b_blocknr; + xfs_start_buffer_writeback(bh); } STATIC void @@ -726,44 +701,41 @@ out_invalidate: return; } -static int -xfs_writepage_submit( - struct xfs_writepage_ctx *wpc, - struct writeback_control *wbc, - int status) -{ - struct blk_plug plug; - - /* Reserve log space if we might write beyond the on-disk inode size. */ - if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN && - xfs_ioend_is_append(wpc->ioend)) - status = xfs_setfilesize_trans_alloc(wpc->ioend); - - if (wpc->iohead) { - blk_start_plug(&plug); - xfs_submit_ioend(wbc, wpc->iohead, status); - blk_finish_plug(&plug); - } - return status; -} - +/* + * We implement an immediate ioend submission policy here to avoid needing to + * chain multiple ioends and hence nest mempool allocations which can violate + * forward progress guarantees we need to provide. The current ioend we are + * adding buffers to is cached on the writepage context, and if the new buffer + * does not append to the cached ioend it will create a new ioend and cache that + * instead. + * + * If a new ioend is created and cached, the old ioend is returned and queued + * locally for submission once the entire page is processed or an error has been + * detected. While ioends are submitted immediately after they are completed, + * batching optimisations are provided by higher level block plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. 
+ */
 static int
 xfs_writepage_map(
 	struct xfs_writepage_ctx *wpc,
+	struct writeback_control *wbc,
 	struct inode		*inode,
 	struct page		*page,
 	loff_t			offset,
 	__uint64_t		end_offset)
 {
+	LIST_HEAD(submit_list);
+	struct xfs_ioend	*ioend, *next;
 	struct buffer_head	*bh, *head;
 	ssize_t			len = 1 << inode->i_blkbits;
 	int			error = 0;
-	int			uptodate = 1;
 	int			count = 0;
+	int			uptodate = 1;
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
-
 	do {
 		if (offset >= end_offset)
 			break;
@@ -816,7 +788,7 @@ xfs_writepage_map(
 			error = xfs_map_blocks(inode, offset, &wpc->imap,
 					     wpc->io_type);
 			if (error)
-				goto out_error;
+				goto out;
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 							 offset);
 		}
@@ -824,46 +796,65 @@ xfs_writepage_map(
 			lock_buffer(bh);
 			if (wpc->io_type != XFS_IO_OVERWRITE)
 				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, wpc);
+			xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list);
 			count++;
 		}
 
-		if (!wpc->iohead)
-			wpc->iohead = wpc->ioend;
-
 	} while (offset += len, ((bh = bh->b_this_page) != head));
 
 	if (uptodate && bh == head)
 		SetPageUptodate(page);
 
-	xfs_start_page_writeback(page, 1, count);
-	ASSERT(wpc->iohead || !count);
-	return 0;
+	ASSERT(wpc->ioend || list_empty(&submit_list));
 
-out_error:
+out:
 	/*
-	 * On error, we have to fail the iohead here because we locked buffers
-	 * in the ioend chain. If we don't do this, we'll deadlock invalidating
-	 * the page as that tries to lock the buffers on the page. Also, because
-	 * we may have set pages under writeback, we have to make sure we run
-	 * IO completion to mark the error state of the IO appropriately, so we
-	 * can't cancel the ioend directly here. That means we have to mark this
-	 * page as under writeback if we included any buffers from it in the
-	 * ioend chain so that completion treats it correctly.
+	 * On error, we have to fail the ioend here because we have locked
+	 * buffers in the ioend. If we don't do this, we'll deadlock
+	 * invalidating the page as that tries to lock the buffers on the page.
+	 * Also, because we may have set pages under writeback, we have to make
+	 * sure we run IO completion to mark the error state of the IO
+	 * appropriately, so we can't cancel the ioend directly here. That means
+	 * we have to mark this page as under writeback if we included any
+	 * buffers from it in the ioend chain so that completion treats it
+	 * correctly.
 	 *
-	 * If we didn't include the page in the ioend, then we can simply
-	 * discard and unlock it as there are no other users of the page or its
-	 * buffers right now. The caller will still need to trigger submission
-	 * of outstanding ioends on the writepage context so they are treated
-	 * correctly on error.
+	 * If we didn't include the page in the ioend, then on error we can
+	 * simply discard and unlock it as there are no other users of the page
+	 * or its buffers right now. The caller will still need to trigger
+	 * submission of outstanding ioends on the writepage context so they are
+	 * treated correctly on error.
 	 */
-	if (count)
-		xfs_start_page_writeback(page, 0, count);
-	else {
+	if (count) {
+		xfs_start_page_writeback(page, !error);
+
+		/*
+		 * Preserve the original error if there was one, otherwise catch
+		 * submission errors here and propagate into subsequent ioend
+		 * submissions.
+		 */
+		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+			int error2;
+
+			list_del_init(&ioend->io_list);
+			error2 = xfs_submit_ioend(wbc, ioend, error);
+			if (error2 && !error)
+				error = error2;
+		}
+	} else if (error) {
 		xfs_aops_discard_page(page);
 		ClearPageUptodate(page);
 		unlock_page(page);
+	} else {
+		/*
+		 * We can end up here with no error and nothing to write if we
+		 * race with a partial page truncate on a sub-page block sized
+		 * filesystem. In that case we need to mark the page clean.
+		 */
+		xfs_start_page_writeback(page, 1);
+		end_page_writeback(page);
 	}
+
 	mapping_set_error(page->mapping, error);
 	return error;
 }
@@ -979,7 +970,7 @@ xfs_do_writepage(
 		end_offset = offset;
 	}
 
-	return xfs_writepage_map(wpc, inode, page, offset, end_offset);
+	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
 
 redirty:
 	redirty_page_for_writepage(wbc, page);
@@ -998,7 +989,9 @@ xfs_vm_writepage(
 	int			ret;
 
 	ret = xfs_do_writepage(page, wbc, &wpc);
-	return xfs_writepage_submit(&wpc, wbc, ret);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }
 
 STATIC int
@@ -1013,7 +1006,9 @@ xfs_vm_writepages(
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
-	return xfs_writepage_submit(&wpc, wbc, ret);
+	if (wpc.ioend)
+		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
+	return ret;
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 3c3f1a37a1c7..4e01bd5b6426 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -41,7 +41,7 @@ enum {
 * It can manage several multi-page bio's at once.
 */
typedef struct xfs_ioend {
-	struct xfs_ioend	*io_list;	/* next ioend in chain */
+	struct list_head	io_list;	/* next ioend in chain */
	unsigned int		io_type;	/* delalloc / unwritten */
	int			io_error;	/* I/O error code */
	atomic_t		io_remaining;	/* hold count */
-- cgit v1.2.3-55-g7522


From 0df61da8ac5b7acb1db6cee142822ebe4724c739 Mon Sep 17 00:00:00 2001
From: Darrick J. Wong
Date: Mon, 7 Mar 2016 09:32:14 +1100
Subject: xfs: ioends require logically contiguous file offsets

We need to create a new ioend if the current writepage call isn't
logically contiguous with the range contained in the previous ioend.
Hopefully writepage gets called in order of increasing file offset.

Signed-off-by: Darrick J. Wong
Reviewed-by: Dave Chinner
Signed-off-by: Dave Chinner
---
 fs/xfs/xfs_aops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/xfs/xfs_aops.c')

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 46dc9211bae7..b125f9019145 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -520,7 +520,8 @@ xfs_add_to_ioend(
 	struct list_head	*iolist)
 {
 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
-	    bh->b_blocknr != wpc->last_block + 1) {
+	    bh->b_blocknr != wpc->last_block + 1 ||
+	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 		struct xfs_ioend	*new;
 
 		if (wpc->ioend)
-- cgit v1.2.3-55-g7522


From 801cc4e17a34c32e1527827292fac9cec5d3393b Mon Sep 17 00:00:00 2001
From: Brian Foster
Date: Tue, 15 Mar 2016 11:42:44 +1100
Subject: xfs: debug mode forced buffered write failure

Add a DEBUG mode-only sysfs knob to enable forced buffered write
failure. An additional side effect of this mode is brute force killing
of delayed allocation blocks in the range of the write. The latter is
the prime motivation behind this patch, as userspace test
infrastructure requires a reliable mechanism to create and split
delalloc extents without causing extent conversion.

Certain fallocate operations (i.e., zero range) were used for this in the past, but the implementations have changed such that delalloc extents are flushed and converted to real blocks, rendering the test useless. Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 9 ++++++- fs/xfs/xfs_mount.h | 25 +++++++++++++++++ fs/xfs/xfs_sysfs.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 101 insertions(+), 11 deletions(-) (limited to 'fs/xfs/xfs_aops.c') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 379c089fb051..eed0bfc4443c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1752,6 +1752,7 @@ xfs_vm_write_failed( loff_t from = pos & (PAGE_CACHE_SIZE - 1); loff_t to = from + len; struct buffer_head *bh, *head; + struct xfs_mount *mp = XFS_I(inode)->i_mount; /* * The request pos offset might be 32 or 64 bit, this is all fine @@ -1786,7 +1787,8 @@ xfs_vm_write_failed( if (!buffer_delay(bh)) continue; - if (!buffer_new(bh) && block_offset < i_size_read(inode)) + if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && + block_offset < i_size_read(inode)) continue; xfs_vm_kill_delalloc_range(inode, block_offset, @@ -1824,6 +1826,7 @@ xfs_vm_write_begin( pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; int status; + struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; ASSERT(len <= PAGE_CACHE_SIZE); @@ -1832,6 +1835,8 @@ xfs_vm_write_begin( return -ENOMEM; status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (xfs_mp_fail_writes(mp)) + status = -EIO; if (unlikely(status)) { struct inode *inode = mapping->host; size_t isize = i_size_read(inode); @@ -1844,6 +1849,8 @@ xfs_vm_write_begin( * allocated in this write, not blocks that were previously * written successfully. */ + if (xfs_mp_fail_writes(mp)) + isize = 0; if (pos + len > isize) { ssize_t start = max_t(ssize_t, pos, isize); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b57098481c10..546cf46a97ce 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -147,6 +147,17 @@ typedef struct xfs_mount { * to various other kinds of pain inflicted on the pNFS server. */ __uint32_t m_generation; + +#ifdef DEBUG + /* + * DEBUG mode instrumentation to test and/or trigger delayed allocation + * block killing in the event of failed writes. When enabled, all + * buffered writes are forced to fail. All delalloc blocks in the range + * of the write (including pre-existing delalloc blocks!) are tossed as + * part of the write failure error handling sequence. + */ + bool m_fail_writes; +#endif } xfs_mount_t; /* @@ -264,6 +275,20 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } +#ifdef DEBUG +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return mp->m_fail_writes; +} +#else +static inline bool +xfs_mp_fail_writes(struct xfs_mount *mp) +{ + return 0; +} +#endif + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. 
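
One thing worth spelling out about the xfs_mount.h hunk above: in
non-DEBUG builds xfs_mp_fail_writes() compiles down to a constant, so
every branch it guards is eliminated and the knob has zero production
cost. A standalone sketch of the same idiom follows (all names here are
invented; note that the stub returns false, which reads more naturally
for a bool than the literal 0 used in the hunk):

/* Standalone sketch of the DEBUG-only predicate idiom (names invented). */
#include <stdbool.h>
#include <stdio.h>

struct mount {
	bool	fail_writes;
};

#ifdef DEBUG
static inline bool fail_writes(struct mount *m)
{
	return m->fail_writes;
}
#else
static inline bool fail_writes(struct mount *m)
{
	(void)m;
	return false;	/* constant: the compiler drops guarded branches */
}
#endif

int main(void)
{
	struct mount m = { .fail_writes = true };

	if (fail_writes(&m))
		printf("buffered write forced to fail\n");
	else
		printf("buffered write proceeds\n");
	return 0;
}

Built with -DDEBUG the predicate reads live state; without it, the
call sites collapse back to the normal write path at compile time.
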
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 641d625eb334..6ced4f143494 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -18,10 +18,13 @@ #include "xfs.h" #include "xfs_sysfs.h" +#include "xfs_format.h" #include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" +#include "xfs_mount.h" struct xfs_sysfs_attr { struct attribute attr; @@ -45,16 +48,6 @@ to_attr(struct attribute *attr) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr -/* - * xfs_mount kobject. This currently has no attributes and thus no need for show - * and store helpers. The mp kobject serves as the per-mount parent object that - * is identified by the fsname under sysfs. - */ - -struct kobj_type xfs_mp_ktype = { - .release = xfs_sysfs_release, -}; - STATIC ssize_t xfs_sysfs_object_show( struct kobject *kobject, @@ -83,6 +76,71 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; +/* + * xfs_mount kobject. The mp kobject also serves as the per-mount parent object + * that is identified by the fsname under sysfs. + */ + +static inline struct xfs_mount * +to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xfs_mount, m_kobj); +} + +#ifdef DEBUG + +STATIC ssize_t +fail_writes_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val == 1) + mp->m_fail_writes = true; + else if (val == 0) + mp->m_fail_writes = false; + else + return -EINVAL; + + return count; +} + +STATIC ssize_t +fail_writes_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0); +} +XFS_SYSFS_ATTR_RW(fail_writes); + +#endif /* DEBUG */ + +static struct attribute *xfs_mp_attrs[] = { +#ifdef DEBUG + ATTR_LIST(fail_writes), +#endif + NULL, +}; + +struct kobj_type xfs_mp_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_mp_attrs, +}; + #ifdef DEBUG /* debug */ -- cgit v1.2.3-55-g7522
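
To round this out, here is a hypothetical userspace helper exercising
the new knob. The patch registers fail_writes on the per-mount kobject,
which is identified by the fsname under sysfs; the exact
/sys/fs/xfs/<device>/fail_writes path below is an assumption to verify
on a live DEBUG kernel rather than something stated in the patch text:

/*
 * Hypothetical test helper for the fail_writes knob. The
 * /sys/fs/xfs/<device>/fail_writes path layout is assumed, not taken
 * from the patch - confirm it on a live DEBUG kernel.
 */
#include <stdio.h>

static int set_fail_writes(const char *device, int enable)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/fs/xfs/%s/fail_writes", device);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* the store method accepts only 0 or 1 */
	fprintf(f, "%d\n", enable);
	return fclose(f);
}

int main(void)
{
	if (set_fail_writes("sda1", 1)) {
		perror("fail_writes");
		return 1;
	}
	return 0;
}

Once enabled, buffered writes into the filesystem fail with -EIO at
write_begin time and the delalloc blocks covering the write range are
torn down, which is precisely the state the userspace test
infrastructure needs to create.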