summaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c1366
1 files changed, 1004 insertions, 362 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..986120f30066 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,8 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -70,58 +72,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
}
/*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
- int err;
-
- might_sleep();
-
- BUFFER_TRACE(bh, "enter");
-
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
- bh, is_metadata, inode->i_mode,
- test_opt(inode->i_sb, DATA_FLAGS));
-
- /* Never use the revoke function if we are doing full data
- * journaling: there is no need to, and a V1 superblock won't
- * support it. Otherwise, only skip the revoke on un-journaled
- * data blocks. */
-
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (!is_metadata && !ext4_should_journal_data(inode))) {
- if (bh) {
- BUFFER_TRACE(bh, "call jbd2_journal_forget");
- return ext4_journal_forget(handle, bh);
- }
- return 0;
- }
-
- /*
- * data!=journal && (is_metadata || should_journal_data(inode))
- */
- BUFFER_TRACE(bh, "call ext4_journal_revoke");
- err = ext4_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_abort(inode->i_sb, __func__,
- "error %d when attempting revoke", err);
- BUFFER_TRACE(bh, "exit");
- return err;
-}
-
-/*
* Work out how many blocks we need to proceed with the next chunk of a
* truncate transaction.
*/
@@ -192,7 +142,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
int nblocks)
{
int ret;
@@ -208,6 +158,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
up_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
return ret;
}
@@ -220,6 +171,9 @@ void ext4_delete_inode(struct inode *inode)
handle_t *handle;
int err;
+ if (!is_bad_inode(inode))
+ dquot_initialize(inode);
+
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
@@ -244,7 +198,7 @@ void ext4_delete_inode(struct inode *inode)
inode->i_size = 0;
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"couldn't mark inode dirty (err %d)", err);
goto stop_handle;
}
@@ -262,7 +216,7 @@ void ext4_delete_inode(struct inode *inode)
if (err > 0)
err = ext4_journal_restart(handle, 3);
if (err != 0) {
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"couldn't extend journal (err %d)", err);
stop_handle:
ext4_journal_stop(handle);
@@ -373,8 +327,7 @@ static int ext4_block_to_path(struct inode *inode,
offsets[n++] = i_block & (ptrs - 1);
final = ptrs;
} else {
- ext4_warning(inode->i_sb, "ext4_block_to_path",
- "block %lu > max in inode %lu",
+ ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
i_block + direct_blocks +
indirect_blocks + double_blocks, inode->i_ino);
}
@@ -394,7 +347,7 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
if (blk &&
unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
blk, 1))) {
- ext4_error(inode->i_sb, function,
+ __ext4_error(inode->i_sb, function,
"invalid block reference %u "
"in inode #%lu", blk, inode->i_ino);
return -EIO;
@@ -657,7 +610,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
if (*err)
goto failed_out;
- BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
+ if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
+ EXT4_ERROR_INODE(inode,
+ "current_block %llu + count %lu > %d!",
+ current_block, count,
+ EXT4_MAX_BLOCK_FILE_PHYS);
+ *err = -EIO;
+ goto failed_out;
+ }
target -= count;
/* allocate blocks for indirect blocks */
@@ -693,7 +653,14 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
ar.flags = EXT4_MB_HINT_DATA;
current_block = ext4_mb_new_blocks(handle, &ar, err);
- BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+ if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
+ EXT4_ERROR_INODE(inode,
+ "current_block %llu + ar.len %d > %d!",
+ current_block, ar.len,
+ EXT4_MAX_BLOCK_FILE_PHYS);
+ *err = -EIO;
+ goto failed_out;
+ }
if (*err && (target == blks)) {
/*
@@ -719,7 +686,7 @@ allocated:
return ret;
failed_out:
for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
return ret;
}
@@ -815,14 +782,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
return err;
failed:
/* Allocation failed, free what we already allocated */
+ ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
- BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, branch[i].bh);
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+ EXT4_FREE_BLOCKS_FORGET);
}
- for (i = 0; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+ for (i = n+1; i < indirect_blks; i++)
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
- ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
return err;
}
@@ -901,12 +874,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
err_out:
for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, where[i].bh);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(where[i-1].key), 1, 0);
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+ EXT4_FREE_BLOCKS_FORGET);
}
- ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+ blks, 0);
return err;
}
@@ -1019,10 +996,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
if (!err)
err = ext4_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- else
+ if (err)
goto cleanup;
set_buffer_new(bh_result);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -1041,92 +1020,121 @@ out:
return err;
}
-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
{
- unsigned long long total;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks +
- EXT4_I(inode)->i_reserved_meta_blocks;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- return total;
+ return &EXT4_I(inode)->i_reserved_quota;
}
+#endif
+
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblocks for non extent file based file
*/
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+ sector_t lblock)
{
- int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ind_blks, dind_blks, tind_blks;
-
- /* number of new indirect blocks needed */
- ind_blks = (blocks + icap - 1) / icap;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+ int blk_bits;
- dind_blks = (ind_blks + icap - 1) / icap;
+ if (lblock < EXT4_NDIR_BLOCKS)
+ return 0;
- tind_blks = 1;
+ lblock -= EXT4_NDIR_BLOCKS;
- return ind_blks + dind_blks + tind_blks;
+ if (ei->i_da_metadata_calc_len &&
+ (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+ ei->i_da_metadata_calc_len++;
+ return 0;
+ }
+ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+ ei->i_da_metadata_calc_len = 1;
+ blk_bits = roundup_pow_of_two(lblock + 1);
+ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
*/
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- if (!blocks)
- return 0;
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
- return ext4_ext_calc_metadata_amount(inode, blocks);
+ return ext4_ext_calc_metadata_amount(inode, lblock);
- return ext4_indirect_calc_metadata_amount(inode, blocks);
+ return ext4_indirect_calc_metadata_amount(inode, lblock);
}
-static void ext4_da_update_reserve_space(struct inode *inode, int used)
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+void ext4_da_update_reserve_space(struct inode *inode,
+ int used, int quota_claim)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - used;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int mdb_free = 0, allocated_meta_blocks = 0;
+
+ spin_lock(&ei->i_block_reservation_lock);
+ trace_ext4_da_update_reserve_space(inode, used);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks\n",
+ __func__, inode->i_ino, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ used = ei->i_reserved_data_blocks;
+ }
- if (mdb_free) {
- /* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+ /* Update per-inode reservations */
+ ei->i_reserved_data_blocks -= used;
+ used += ei->i_allocated_meta_blocks;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ allocated_meta_blocks = ei->i_allocated_meta_blocks;
+ ei->i_allocated_meta_blocks = 0;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
- /* update fs dirty blocks counter */
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ mdb_free = ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
- EXT4_I(inode)->i_allocated_meta_blocks = 0;
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
}
-
- /* update per-inode reservations */
- BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= used;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- /*
- * free those over-booking quota for metadata blocks
- */
- if (mdb_free)
- vfs_dq_release_reservation_block(inode, mdb_free);
+ /* Update quota subsystem */
+ if (quota_claim) {
+ dquot_claim_block(inode, used);
+ if (mdb_free)
+ dquot_release_reservation_block(inode, mdb_free);
+ } else {
+ /*
+ * We did fallocate with an offset that is already delayed
+ * allocated. So on delayed allocated writeback we should
+ * not update the quota for allocated blocks. But then
+ * converting an fallocate region to initialized region would
+ * have caused a metadata allocation. So claim quota for
+ * that
+ */
+ if (allocated_meta_blocks)
+ dquot_claim_block(inode, allocated_meta_blocks);
+ dquot_release_reservation_block(inode, mdb_free + used);
+ }
/*
* If we have done all the pending block allocations and if
* there aren't any writers on the inode, we can discard the
* inode's preallocations.
*/
- if (!total && (atomic_read(&inode->i_writecount) == 0))
+ if ((ei->i_reserved_data_blocks == 0) &&
+ (atomic_read(&inode->i_writecount) == 0))
ext4_discard_preallocations(inode);
}
@@ -1134,7 +1142,7 @@ static int check_block_validity(struct inode *inode, const char *msg,
sector_t logical, sector_t phys, int len)
{
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- ext4_error(inode->i_sb, msg,
+ __ext4_error(inode->i_sb, msg,
"inode #%lu logical block %llu mapped to %llu "
"(size %d)", inode->i_ino,
(unsigned long long) logical,
@@ -1145,6 +1153,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
}
/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct pagevec pvec;
+ pgoff_t num = 0;
+ int i, nr_pages, done = 0;
+
+ if (max_pages == 0)
+ return 0;
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ index = idx;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ (pgoff_t)PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ lock_page(page);
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page) ||
+ PageWriteback(page) ||
+ page->index != idx) {
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_delay(bh) &&
+ !buffer_unwritten(bh))
+ done = 1;
+ bh = bh->b_this_page;
+ } while (!done && (bh != head));
+ }
+ unlock_page(page);
+ if (done)
+ break;
+ idx++;
+ num++;
+ if (num >= max_pages)
+ break;
+ }
+ pagevec_release(&pvec);
+ }
+ return num;
+}
+
+/*
* The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -1175,6 +1241,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
+ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, max_blocks,
+ (unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1255,20 +1324,22 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* i_data's format changing. Force the migrate
* to fail by clearing migrate flags
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
- }
+ /*
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now. We don't
+ * support fallocate for non extent files. So we can update
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
+ }
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
EXT4_I(inode)->i_delalloc_reserved_flag = 0;
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now.
- */
- if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
- ext4_da_update_reserve_space(inode, retval);
-
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && buffer_mapped(bh)) {
int ret = check_block_validity(inode, "file system "
@@ -1471,6 +1542,18 @@ static int do_journal_get_write_access(handle_t *handle,
return ext4_journal_get_write_access(handle, bh);
}
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1512,8 +1595,12 @@ retry:
}
*pagep = page;
- ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_get_block);
+ if (ext4_should_dioread_nolock(inode))
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata, ext4_get_block_write);
+ else
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep,
+ fsdata, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
@@ -1536,7 +1623,7 @@ retry:
ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
@@ -1646,7 +1733,7 @@ static int ext4_ordered_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1688,7 +1775,7 @@ static int ext4_writeback_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1730,7 +1817,7 @@ static int ext4_journalled_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -1751,7 +1838,7 @@ static int ext4_journalled_write_end(struct file *file,
if (!ret)
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1764,11 +1851,16 @@ static int ext4_journalled_write_end(struct file *file,
return ret ? ret : copied;
}
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
{
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned long md_needed, mdblocks, total = 0;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long md_needed, md_reserved;
+ int ret;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1776,86 +1868,80 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
* worse case is one extent per block
*/
repeat:
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
- mdblocks = ext4_calc_metadata_amount(inode, total);
- BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
- md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
- total = md_needed + nrblocks;
+ spin_lock(&ei->i_block_reservation_lock);
+ md_reserved = ei->i_reserved_meta_blocks;
+ md_needed = ext4_calc_metadata_amount(inode, lblock);
+ trace_ext4_da_reserve_space(inode, md_needed);
+ spin_unlock(&ei->i_block_reservation_lock);
/*
* Make quota reservation here to prevent quota overflow
* later. Real quota accounting is done at pages writeout
* time.
*/
- if (vfs_dq_reserve_block(inode, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return -EDQUOT;
- }
+ ret = dquot_reserve_block(inode, md_needed + 1);
+ if (ret)
+ return ret;
- if (ext4_claim_free_blocks(sbi, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+ dquot_release_reservation_block(inode, md_needed + 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
- vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
- EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
- EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return 0; /* success */
}
static void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free, release;
+ struct ext4_inode_info *ei = EXT4_I(inode);
if (!to_free)
return; /* Nothing to release, exit */
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- if (!EXT4_I(inode)->i_reserved_data_blocks) {
+ if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
- * if there is no reserved blocks, but we try to free some
- * then the counter is messed up somewhere.
- * but since this function is called from invalidate
- * page, it's harmless to return without any action
+ * if there aren't enough reserved blocks, then the
+ * counter is messed up somewhere. Since this
+ * function is called from invalidate page, it's
+ * harmless to return without any action.
*/
- printk(KERN_INFO "ext4 delalloc try to release %d reserved "
- "blocks for inode %lu, but there is no reserved "
- "data blocks\n", to_free, inode->i_ino);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return;
+ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ "ino %lu, to_free %d with only %d reserved "
+ "data blocks\n", inode->i_ino, to_free,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ to_free = ei->i_reserved_data_blocks;
}
+ ei->i_reserved_data_blocks -= to_free;
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
- release = to_free + mdb_free;
-
- /* update fs dirty blocks counter for truncate case */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ to_free += ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ }
- /* update per-inode reservations */
- BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+ /* update fs dirty blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- vfs_dq_release_reservation_block(inode, release);
+ dquot_release_reservation_block(inode, to_free);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -2032,6 +2118,8 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
} else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
+ if (buffer_uninit(exbh))
+ set_buffer_uninit(bh);
cur_logical++;
pblock++;
} while ((bh = bh->b_this_page) != head);
@@ -2074,17 +2162,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
+ if (page->index > end)
break;
- index++;
-
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
block_invalidatepage(page, 0);
ClearPageUptodate(page);
unlock_page(page);
}
+ index = pvec.pages[nr_pages - 1]->index + 1;
+ pagevec_release(&pvec);
}
return;
}
@@ -2092,18 +2179,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_EMERG "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
- printk(KERN_EMERG "Free/Dirty block details\n");
- printk(KERN_EMERG "free_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
- printk(KERN_EMERG "dirty_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- printk(KERN_EMERG "Block reservation details\n");
- printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ printk(KERN_CRIT "Total free blocks count %lld\n",
+ ext4_count_free_blocks(inode->i_sb));
+ printk(KERN_CRIT "Free/Dirty block details\n");
+ printk(KERN_CRIT "free_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+ printk(KERN_CRIT "dirty_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ printk(KERN_CRIT "Block reservation details\n");
+ printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
@@ -2160,10 +2247,12 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* variables are updated after the blocks have been allocated.
*/
new.b_state = 0;
- get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
- EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+ if (ext4_should_dioread_nolock(mpd->inode))
+ get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (mpd->b_state & (1 << BH_Delay))
- get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
+
blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
&new, get_blocks_flags);
if (blks < 0) {
@@ -2189,14 +2278,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* writepage and writepages will again try to write
* the same.
*/
- printk(KERN_EMERG "%s block allocation failed for inode %lu "
- "at logical offset %llu with max blocks "
- "%zd with error %d\n",
- __func__, mpd->inode->i_ino,
- (unsigned long long)next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- printk(KERN_EMERG "This should not happen.!! "
- "Data will be lost\n");
+ ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+ "delayed block allocation failed for inode %lu at "
+ "logical offset %llu with max blocks %zd with "
+ "error %d\n", mpd->inode->i_ino,
+ (unsigned long long) next,
+ mpd->b_size >> mpd->inode->i_blkbits, err);
+ printk(KERN_CRIT "This should not happen!! "
+ "Data will be lost\n");
if (err == -ENOSPC) {
ext4_print_free_blocks(mpd->inode);
}
@@ -2461,7 +2550,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
- ret = ext4_da_reserve_space(inode, 1);
+ ret = ext4_da_reserve_space(inode, iblock);
if (ret)
/* not enough space to reserve */
return ret;
@@ -2537,7 +2626,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
}
static int __ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc,
unsigned int len)
{
struct address_space *mapping = page->mapping;
@@ -2572,11 +2660,14 @@ static int __ext4_journalled_writepage(struct page *page,
ret = err;
walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
return ret;
}
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
+
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -2624,7 +2715,7 @@ static int ext4_writepage(struct page *page,
int ret = 0;
loff_t size;
unsigned int len;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
trace_ext4_writepage(inode, page);
@@ -2695,12 +2786,16 @@ static int ext4_writepage(struct page *page,
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- return __ext4_journalled_writepage(page, wbc, len);
+ return __ext4_journalled_writepage(page, len);
}
if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
ret = nobh_writepage(page, noalloc_get_block_write, wbc);
- else
+ else if (page_bufs && buffer_uninit(page_bufs)) {
+ ext4_set_bh_endio(page_bufs, inode);
+ ret = block_write_full_page_endio(page, noalloc_get_block_write,
+ wbc, ext4_end_io_buffer_write);
+ } else
ret = block_write_full_page(page, noalloc_get_block_write,
wbc);
@@ -2725,7 +2820,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2743,8 +2838,10 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ unsigned int max_pages;
int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0, nr_to_writebump = 0;
+ int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0;
loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2771,16 +2868,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
- /*
- * Make sure nr_to_write is >= sbi->s_mb_stream_request
- * This make sure small files blocks are allocated in
- * single attempt. This ensure that small files
- * get less fragmented.
- */
- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
- wbc->nr_to_write = sbi->s_mb_stream_request;
- }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2795,6 +2882,36 @@ static int ext4_da_writepages(struct address_space *mapping,
} else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectues have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole)
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
mpd.wbc = wbc;
mpd.inode = mapping->host;
@@ -2822,10 +2939,9 @@ retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- printk(KERN_CRIT "%s: jbd2_start: "
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- dump_stack();
goto out_writepages;
}
@@ -2849,7 +2965,7 @@ retry:
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
&mpd);
/*
- * If we have a contigous extent of pages and we
+ * If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
* them for I/O.
*/
@@ -2897,9 +3013,10 @@ retry:
goto retry;
}
if (pages_skipped != wbc->pages_skipped)
- printk(KERN_EMERG "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
- __func__, wbc->nr_to_write, ret);
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
@@ -2939,11 +3056,18 @@ static int ext4_nonda_switch(struct super_block *sb)
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
/*
- * free block count is less that 150% of dirty blocks
- * or free blocks is less that watermark
+ * free block count is less than 150% of dirty blocks
+ * or free blocks is less than watermark
*/
return 1;
}
+ /*
+ * Even if we don't switch but are nearing capacity,
+ * start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (free_blocks < 2 * dirty_blocks)
+ writeback_inodes_sb_if_idle(sb);
+
return 0;
}
@@ -2951,7 +3075,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret, retries = 0;
+ int ret, retries = 0, quota_retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
@@ -3005,11 +3129,27 @@ retry:
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+
+ if ((ret == -EDQUOT) &&
+ EXT4_I(inode)->i_reserved_meta_blocks &&
+ (quota_retries++ < 3)) {
+ /*
+ * Since we often over-estimate the number of meta
+ * data blocks required, we may sometimes get a
+ * spurios out of quota error even though there would
+ * be enough space once we write the data blocks and
+ * find out how many meta data blocks were _really_
+ * required. So try forcing the inode write to see if
+ * that helps.
+ */
+ write_inode_now(inode, (quota_retries == 3));
+ goto retry;
+ }
out:
return ret;
}
@@ -3198,7 +3338,8 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
filemap_write_and_wait(mapping);
}
- if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
+ if (EXT4_JOURNAL(inode) &&
+ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
/*
* This is a REALLY heavyweight approach, but the use of
* bmap on dirty files is expected to be extremely rare:
@@ -3217,7 +3358,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
* everything they get.
*/
- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
+ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
journal = EXT4_JOURNAL(inode);
jbd2_journal_lock_updates(journal);
err = jbd2_journal_flush(journal);
@@ -3242,11 +3383,45 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ if (io->page)
+ put_page(io->page);
+ iput(io->inode);
+ kfree(io);
+}
+
+static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
+{
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ if (!page_has_buffers(page))
+ return;
+ head = bh = page_buffers(page);
+ do {
+ if (offset <= curr_off && test_clear_buffer_uninit(bh)
+ && bh->b_private) {
+ ext4_free_io_end(bh->b_private);
+ bh->b_private = NULL;
+ bh->b_end_io = NULL;
+ }
+ curr_off = curr_off + bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
journal_t *journal = EXT4_JOURNAL(page->mapping->host);
/*
+ * free any io_end structure allocated for buffers to be discarded
+ */
+ if (ext4_should_dioread_nolock(page->mapping->host))
+ ext4_invalidatepage_free_endio(page, offset);
+ /*
* If it's a full truncate we just forget about the pending dirtying
*/
if (offset == 0)
@@ -3272,6 +3447,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3280,7 +3457,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3291,6 +3468,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -3313,9 +3491,19 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
}
}
- ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+retry:
+ if (rw == READ && ext4_should_dioread_nolock(inode))
+ ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block, NULL);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -3327,6 +3515,9 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
* but cannot extend i_size. Bail out and pretend
* the write failed... */
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+
goto out;
}
if (inode->i_nlink)
@@ -3354,6 +3545,439 @@ out:
return ret;
}
+static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = ext4_journal_current_handle();
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+ int started = 0;
+
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /*
+ * ext4_get_block in prepare for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after IO complete.
+ */
+ create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+ if (!handle) {
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ started = 1;
+ }
+
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ if (started)
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+static void dump_completed_IO(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+ unsigned long flags;
+
+ if (list_empty(&EXT4_I(inode)->i_completed_io_list)){
+ ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino);
+ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+ list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+ spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+#endif
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+static int ext4_end_io_nolock(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ ssize_t size = io->size;
+ int ret = 0;
+
+ ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ if (list_empty(&io->list))
+ return ret;
+
+ if (io->flag != EXT4_IO_UNWRITTEN)
+ return ret;
+
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
+ return ret;
+}
+
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_io_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+ int ret;
+
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_io_nolock(io);
+ if (ret < 0) {
+ mutex_unlock(&inode->i_mutex);
+ return;
+ }
+
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ mutex_unlock(&inode->i_mutex);
+ ext4_free_io_end(io);
+}
+
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When IO is completed, the work to convert unwritten extents to
+ * written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of pending/completed IO that
+ * might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents for completed IO
+ * to written.
+ * The function return the number of pending IOs on success.
+ */
+int flush_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long flags;
+ int ret = 0;
+ int ret2 = 0;
+
+ if (list_empty(&ei->i_completed_io_list))
+ return ret;
+
+ dump_completed_IO(inode);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ while (!list_empty(&ei->i_completed_io_list)){
+ io = list_entry(ei->i_completed_io_list.next,
+ ext4_io_end_t, list);
+ /*
+ * Calling ext4_end_io_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ ret = ext4_end_io_nolock(io);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
+ }
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ return (ret2 < 0) ? ret2 : 0;
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), flags);
+
+ if (io) {
+ igrab(inode);
+ io->inode = inode;
+ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->page = NULL;
+ INIT_WORK(&io->work, ext4_end_io_work);
+ INIT_LIST_HEAD(&io->list);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+ unsigned long flags;
+ struct ext4_inode_info *ei;
+
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != EXT4_IO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
+ return;
+ }
+
+ io_end->offset = offset;
+ io_end->size = size;
+ io_end->flag = EXT4_IO_UNWRITTEN;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ /* Add the io_end to per-inode completed aio dio list*/
+ ei = EXT4_I(io_end->inode);
+ spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &ei->i_completed_io_list);
+ spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+ iocb->private = NULL;
+}
+
+static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
+{
+ ext4_io_end_t *io_end = bh->b_private;
+ struct workqueue_struct *wq;
+ struct inode *inode;
+ unsigned long flags;
+
+ if (!test_clear_buffer_uninit(bh) || !io_end)
+ goto out;
+
+ if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
+ printk("sb umounted, discard end_io request for inode %lu\n",
+ io_end->inode->i_ino);
+ ext4_free_io_end(io_end);
+ goto out;
+ }
+
+ io_end->flag = EXT4_IO_UNWRITTEN;
+ inode = io_end->inode;
+
+ /* Add the io_end to per-inode completed io list*/
+ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
+ list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
+ spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
+
+ wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+out:
+ bh->b_private = NULL;
+ bh->b_end_io = NULL;
+ clear_buffer_uninit(bh);
+ end_buffer_async_write(bh, uptodate);
+}
+
+static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
+{
+ ext4_io_end_t *io_end;
+ struct page *page = bh->b_page;
+ loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
+ size_t size = bh->b_size;
+
+retry:
+ io_end = ext4_init_io_end(inode, GFP_ATOMIC);
+ if (!io_end) {
+ if (printk_ratelimit())
+ printk(KERN_WARNING "%s: allocation fail\n", __func__);
+ schedule();
+ goto retry;
+ }
+ io_end->offset = offset;
+ io_end->size = size;
+ /*
+ * We need to hold a reference to the page to make sure it
+ * doesn't get evicted before ext4_end_io_work() has a chance
+ * to convert the extent from written to unwritten.
+ */
+ io_end->page = page;
+ get_page(io_end->page);
+
+ bh->b_private = io_end;
+ bh->b_end_io = ext4_end_io_buffer_write;
+ return 0;
+}
+
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be defered when
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
+ */
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ iocb->private = ext4_init_io_end(inode, GFP_NOFS);
+ if (!iocb->private)
+ return -ENOMEM;
+ /*
+ * we save the io structure for current async
+ * direct IO, so that later ext4_get_blocks()
+ * could flag the io structure whether there
+ * is a unwritten extents needs to be converted
+ * when IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_write,
+ ext4_end_io_dio);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode need to be dropped, when IO is
+ * complete, even with 0 byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * desctroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the convertion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ }
+ return ret;
+ }
+
+ /* for write the the end of file case, we fall back to old way */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3614,7 +4238,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
int k, err;
*top = 0;
- /* Make k index the deepest non-null offest + 1 */
+ /* Make k index the deepest non-null offset + 1 */
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -3663,13 +4287,27 @@ no_top:
* We release `count' blocks on disk, but (last - first) may be greater
* than `count' because there can be holes in there.
*/
-static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh,
- ext4_fsblk_t block_to_free,
- unsigned long count, __le32 *first,
- __le32 *last)
+static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh,
+ ext4_fsblk_t block_to_free,
+ unsigned long count, __le32 *first,
+ __le32 *last)
{
__le32 *p;
+ int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
+ count)) {
+ ext4_error(inode->i_sb, "inode #%lu: "
+ "attempt to clear blocks %llu len %lu, invalid",
+ inode->i_ino, (unsigned long long) block_to_free,
+ count);
+ return 1;
+ }
+
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -3684,27 +4322,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
}
}
- /*
- * Any buffers which are on the journal will be in memory. We
- * find them on the hash table so jbd2_journal_revoke() will
- * run jbd2_journal_forget() on them. We've already detached
- * each block from the file, so bforget() in
- * jbd2_journal_forget() should be safe.
- *
- * AKPM: turn on bforget in jbd2_journal_forget()!!!
- */
- for (p = first; p < last; p++) {
- u32 nr = le32_to_cpu(*p);
- if (nr) {
- struct buffer_head *tbh;
-
- *p = 0;
- tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
- }
- }
+ for (p = first; p < last; p++)
+ *p = 0;
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
+ ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
+ return 0;
}
/**
@@ -3760,9 +4382,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
} else if (nr == block_to_free + count) {
count++;
} else {
- ext4_clear_blocks(handle, inode, this_bh,
- block_to_free,
- count, block_to_free_p, p);
+ if (ext4_clear_blocks(handle, inode, this_bh,
+ block_to_free, count,
+ block_to_free_p, p))
+ break;
block_to_free = nr;
block_to_free_p = p;
count = 1;
@@ -3786,7 +4409,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
ext4_handle_dirty_metadata(handle, inode, this_bh);
else
- ext4_error(inode->i_sb, __func__,
+ ext4_error(inode->i_sb,
"circular indirect block detected, "
"inode=%lu, block=%llu",
inode->i_ino,
@@ -3826,6 +4449,16 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!nr)
continue; /* A hole */
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+ nr, 1)) {
+ ext4_error(inode->i_sb,
+ "indirect mapped block in inode "
+ "#%lu invalid (level %d, blk #%lu)",
+ inode->i_ino, depth,
+ (unsigned long) nr);
+ break;
+ }
+
/* Go read the buffer for the next level down */
bh = sb_bread(inode->i_sb, nr);
@@ -3834,7 +4467,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* (should be rare).
*/
if (!bh) {
- ext4_error(inode->i_sb, "ext4_free_branches",
+ ext4_error(inode->i_sb,
"Read failure, inode=%lu, block=%llu",
inode->i_ino, nr);
continue;
@@ -3892,7 +4525,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
blocks_for_truncate(inode));
}
- ext4_free_blocks(handle, inode, nr, 1, 1);
+ ext4_free_blocks(handle, inode, 0, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA);
if (parent_bh) {
/*
@@ -3977,8 +4611,10 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
+ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
@@ -4148,9 +4784,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (!bh) {
- ext4_error(sb, "ext4_get_inode_loc", "unable to read "
- "inode block - inode=%lu, block=%llu",
- inode->i_ino, block);
+ ext4_error(sb, "unable to read inode block - "
+ "inode=%lu, block=%llu", inode->i_ino, block);
return -EIO;
}
if (!buffer_uptodate(bh)) {
@@ -4248,9 +4883,8 @@ make_io:
submit_bh(READ_META, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(sb, __func__,
- "unable to read inode block - inode=%lu, "
- "block=%llu", inode->i_ino, block);
+ ext4_error(sb, "unable to read inode block - inode=%lu,"
+ " block=%llu", inode->i_ino, block);
brelse(bh);
return -EIO;
}
@@ -4264,7 +4898,7 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
/* We have all inode data except xattrs in memory here. */
return __ext4_get_inode_loc(inode, iloc,
- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
+ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
}
void ext4_set_inode_flags(struct inode *inode)
@@ -4331,8 +4965,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
@@ -4343,11 +4977,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
+ iloc.bh = 0;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4358,7 +4992,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
}
inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
- ei->i_state = 0;
+ ei->i_state_flags = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -4370,7 +5004,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (inode->i_mode == 0 ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
/* this inode is deleted */
- brelse(bh);
ret = -ESTALE;
goto bad_inode;
}
@@ -4387,6 +5020,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
ei->i_last_alloc_group = ~0;
@@ -4398,11 +5034,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb)) {
- brelse(bh);
ret = -EIO;
goto bad_inode;
}
@@ -4415,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_GOOD_OLD_INODE_SIZE +
ei->i_extra_isize;
if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ei->i_state |= EXT4_STATE_XATTR;
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
}
} else
ei->i_extra_isize = 0;
@@ -4434,12 +5094,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
- ext4_error(sb, __func__,
- "bad extended attribute block %llu in inode #%lu",
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ ext4_error(sb, "bad extended attribute block %llu inode #%lu",
ei->i_file_acl, inode->i_ino);
ret = -EIO;
goto bad_inode;
@@ -4455,10 +5111,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* Validate block references which are part of inode */
ret = ext4_check_inode_blockref(inode);
}
- if (ret) {
- brelse(bh);
+ if (ret)
goto bad_inode;
- }
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -4486,10 +5140,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
- brelse(bh);
ret = -EIO;
- ext4_error(inode->i_sb, __func__,
- "bogus i_mode (%o) for inode=%lu",
+ ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
inode->i_mode, inode->i_ino);
goto bad_inode;
}
@@ -4499,6 +5151,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -4551,8 +5204,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
- struct ext4_iloc *iloc,
- int do_sync)
+ struct ext4_iloc *iloc)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4561,7 +5213,7 @@ static int ext4_do_update_inode(handle_t *handle,
/* For fields not not tracking in the in-memory inode,
* initialise them to zero for new inodes. */
- if (ei->i_state & EXT4_STATE_NEW)
+ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
ext4_get_inode_flags(ei);
@@ -4625,7 +5277,7 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
sb->s_dirt = 1;
ext4_handle_sync(handle);
- err = ext4_handle_dirty_metadata(handle, inode,
+ err = ext4_handle_dirty_metadata(handle, NULL,
EXT4_SB(sb)->s_sbh);
}
}
@@ -4653,24 +5305,13 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
- /*
- * If we're not using a journal and we were called from
- * ext4_write_inode() to sync the inode (making do_sync true),
- * we can just use sync_dirty_buffer() directly to do our dirty
- * work. Testing s_journal here is a bit redundant but it's
- * worth it to avoid potential future trouble.
- */
- if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
- BUFFER_TRACE(bh, "call sync_dirty_buffer");
- sync_dirty_buffer(bh);
- } else {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!err)
- err = rc;
- }
- ei->i_state &= ~EXT4_STATE_NEW;
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (!err)
+ err = rc;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4712,7 +5353,7 @@ out_brelse:
* `stuff()' is running, and the new i_size will be lost. Plus the inode
* will no longer be on the superblock's dirty inode list.
*/
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
@@ -4726,7 +5367,7 @@ int ext4_write_inode(struct inode *inode, int wait)
return -EIO;
}
- if (!wait)
+ if (wbc->sync_mode != WB_SYNC_ALL)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -4736,8 +5377,14 @@ int ext4_write_inode(struct inode *inode, int wait)
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
- err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
- inode, &iloc, wait);
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_error(inode->i_sb, "IO error syncing inode, "
+ "inode=%lu, block=%llu", inode->i_ino,
+ (unsigned long long)iloc.bh->b_blocknr);
+ err = -EIO;
+ }
}
return err;
}
@@ -4776,19 +5423,21 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
+ if (ia_valid & ATTR_SIZE)
+ dquot_initialize(inode);
if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
}
- error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ error = dquot_transfer(inode, attr);
if (error) {
ext4_journal_stop(handle);
return error;
@@ -4815,7 +5464,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ attr->ia_valid & ATTR_SIZE &&
+ (attr->ia_size < inode->i_size ||
+ (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -4846,6 +5497,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
+ /* ext4_truncate will clear the flag */
+ if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+ ext4_truncate(inode);
}
rc = inode_setattr(inode, attr);
@@ -4931,7 +5585,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* worse case, the indexs blocks spread over different block groups
*
* If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiuguous, with flexbg,
* they could still across block group boundary.
*
* Also account for superblock, inode, quota and xattr blocks
@@ -5007,7 +5661,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
* Calculate the journal credits for a chunk of data modification.
*
* This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
*
* journal buffers for data blocks are not included here, as DIO
* and fallocate do no need to journal data buffers.
@@ -5033,7 +5687,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
- err = ext4_do_update_inode(handle, inode, iloc, 0);
+ err = ext4_do_update_inode(handle, inode, iloc);
put_bh(iloc->bh);
return err;
}
@@ -5084,8 +5738,8 @@ static int ext4_expand_extra_isize(struct inode *inode,
entry = IFIRST(header);
/* No extended attributes present */
- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
new_extra_isize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
@@ -5129,7 +5783,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
+ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
/*
* We need extra buffer credits since we may write into EA block
* with this same handle. If journal_extend fails, then it will
@@ -5143,10 +5797,11 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+ ext4_set_inode_state(inode,
+ EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
- ext4_warning(inode->i_sb, __func__,
+ ext4_warning(inode->i_sb,
"Unable to expand inode %lu. Delete"
" some EAs or run e2fsck.",
inode->i_ino);
@@ -5168,7 +5823,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
+ * Also, dquot_alloc_block() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
@@ -5177,27 +5832,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
*/
void ext4_dirty_inode(struct inode *inode)
{
- handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- if (!ext4_handle_valid(current_handle)) {
- ext4_mark_inode_dirty(current_handle, inode);
- return;
- }
-
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __func__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
@@ -5223,7 +5865,7 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode)
err = jbd2_journal_get_write_access(handle, iloc.bh);
if (!err)
err = ext4_handle_dirty_metadata(handle,
- inode,
+ NULL,
iloc.bh);
brelse(iloc.bh);
}