From 40431d6c1288793a682fc6f5e5b5c9d5cac34608 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 5 Aug 2009 12:57:59 -0400 Subject: Btrfs: optimize set extent bit The Btrfs set_extent_bit call currently searches the rbtree every time it needs to find more extent_state objects to fill the requested operation. This adds a simple test with rb_next to see if the next object in the tree was adjacent to the one we just found. If so, we skip the search and just use the next object. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 68260180f587..7e5c5a0749e2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -694,8 +694,8 @@ again: BUG_ON(err == -EEXIST); goto out; } - state = rb_entry(node, struct extent_state, rb_node); +hit_next: last_start = state->start; last_end = state->end; @@ -706,6 +706,7 @@ again: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { + struct rb_node *next_node; set = state->state & bits; if (set && exclusive) { *failed_start = state->start; @@ -716,7 +717,17 @@ again: merge_state(tree, state); if (last_end == (u64)-1) goto out; + start = last_end + 1; + if (start < end && prealloc && !need_resched()) { + next_node = rb_next(node); + if (next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + } goto search_again; } @@ -852,7 +863,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY, + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 0, NULL, mask); } -- cgit v1.2.3-55-g7522 From a97adc9fffb1707da4e97f91c801660c6be92aac Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 7 Aug 2009 09:28:20 -0400 Subject: Btrfs: use larger nr_to_write for 
larger extents When btrfs fills a large delayed allocation extent, it is a good idea to try and convince the write_cache_pages caller to go ahead and write a good chunk of that extent. The extra IO is basically free because we know it is contiguous. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7e5c5a0749e2..8d7a152a90c6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2135,6 +2135,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + u64 delalloc_to_write; /* * make sure the wbc mapping index is at least updated * to this page. @@ -2154,6 +2155,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, tree->ops->fill_delalloc(inode, page, delalloc_start, delalloc_end, &page_started, &nr_written); + delalloc_to_write = (delalloc_end - + max_t(u64, page_offset(page), + delalloc_start) + 1) >> + PAGE_CACHE_SHIFT; + if (wbc->nr_to_write < delalloc_to_write) { + wbc->nr_to_write = min_t(long, 8192, + delalloc_to_write); + } delalloc_start = delalloc_end + 1; } @@ -2350,7 +2359,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, writepage_t writepage, void *data, void (*flush_fn)(void *)) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; @@ -2425,10 +2433,6 @@ retry: } if (ret || wbc->nr_to_write <= 0) done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } } pagevec_release(&pvec); cond_resched(); -- cgit v1.2.3-55-g7522 From 890871be854b5f5e43e7ba2475f706209906cc24 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:24:52 -0400 Subject: Btrfs: switch extent_map to a rw lock There are two main users of the 
extent_map tree. The first is regular file inodes, where it is evenly spread between readers and writers. The second is the chunk allocation tree, which maps blocks from logical addresses to physical ones, and it is 99.99% reads. The mapping tree is a point of lock contention during heavy IO workloads, so this commit switches things to a rw lock. Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 8 ++++---- fs/btrfs/disk-io.c | 14 +++++++------- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/extent_map.c | 5 +---- fs/btrfs/extent_map.h | 2 +- fs/btrfs/file.c | 8 ++++---- fs/btrfs/inode.c | 24 ++++++++++++------------ fs/btrfs/relocation.c | 4 ++-- fs/btrfs/volumes.c | 40 ++++++++++++++++++++-------------------- 10 files changed, 57 insertions(+), 60 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de1e2fd32080..78451a58f209 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -507,10 +507,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ set_page_extent_mapped(page); lock_extent(tree, last_offset, end, GFP_NOFS); - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || @@ -594,11 +594,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/disk-io.c
b/fs/btrfs/disk-io.c index 20cefc6f22c4..b6cfdd9164e2 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -121,15 +121,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, struct extent_map *em; int ret; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); goto out; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); em = alloc_extent_map(GFP_NOFS); if (!em) { @@ -142,7 +142,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->block_start = 0; em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { u64 failed_start = em->start; @@ -161,7 +161,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret) em = ERR_PTR(ret); @@ -1323,9 +1323,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) offset = page_offset(page); em_tree = &BTRFS_I(inode)->extent_tree; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em) { __unplug_io_fn(bdi, page); return; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 72a2b9c28e9f..edd86ae9e149 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5396,9 +5396,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); while (1) { int ret; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + 
write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8d7a152a90c6..41cf1b451b41 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2786,15 +2786,15 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 len; while (start <= end) { len = end - start + 1; - spin_lock(&map->lock); + write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); if (!em || IS_ERR(em)) { - spin_unlock(&map->lock); + write_unlock(&map->lock); break; } if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || em->start != start) { - spin_unlock(&map->lock); + write_unlock(&map->lock); free_extent_map(em); break; } @@ -2808,7 +2808,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, free_extent_map(em); } start = extent_map_end(em); - spin_unlock(&map->lock); + write_unlock(&map->lock); /* once for us */ free_extent_map(em); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 30c9365861e6..72e9fa3c31f5 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -36,7 +36,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - spin_lock_init(&tree->lock); + rwlock_init(&tree->lock); } /** @@ -222,7 +222,6 @@ int add_extent_mapping(struct extent_map_tree *tree, ret = -EEXIST; goto out; } - assert_spin_locked(&tree->lock); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; @@ -285,7 +284,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - assert_spin_locked(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); @@ -331,7 +329,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) int ret = 0; WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - 
assert_spin_locked(&tree->lock); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; return ret; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index fb6eeef06bb0..6216dfbcf9be 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -31,7 +31,7 @@ struct extent_map { struct extent_map_tree { struct rb_root map; - spinlock_t lock; + rwlock_t lock; }; static inline u64 extent_map_end(struct extent_map *em) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a760d97279ac..8a9c76aecdf3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -188,15 +188,15 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (!split2) split2 = alloc_extent_map(GFP_NOFS); - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); break; } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (em->start <= start && (!testend || em->start + em->len >= start + len)) { free_extent_map(em); @@ -259,7 +259,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); split = NULL; } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); /* once for us */ free_extent_map(em); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 04b53b5ebe59..f1df11718618 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -612,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode, set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -748,9 +748,9 @@ static noinline int cow_file_range(struct inode *inode, set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - 
spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -1081,9 +1081,9 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -1670,13 +1670,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, failrec->last_mirror = 0; failrec->bio_flags = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); if (em->start > start || em->start + em->len < start) { free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || IS_ERR(em)) { kfree(failrec); @@ -4069,11 +4069,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int compressed; again: - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) em->bdev = root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) @@ -4264,7 +4264,7 @@ insert: } err = 0; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. 
It is also possible that @@ -4304,7 +4304,7 @@ insert: err = 0; } } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c04f7f212602..4adab903fc2b 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2646,9 +2646,9 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); while (1) { int ret; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a7e53773e743..d2358c06bbd9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1749,9 +1749,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, * step two, delete the device extents and the * chunk tree entries */ - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset); @@ -1780,9 +1780,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); BUG_ON(ret); - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); kfree(map); em->bdev = NULL; @@ -2294,9 +2294,9 @@ again: em->block_len = em->len; em_tree = &extent_root->fs_info->mapping_tree.map_tree; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); BUG_ON(ret); free_extent_map(em); @@ -2491,9 +2491,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) int 
readonly = 0; int i; - spin_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - spin_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->map_tree.lock); if (!em) return 1; @@ -2518,11 +2518,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) struct extent_map *em; while (1) { - spin_lock(&tree->map_tree.lock); + write_lock(&tree->map_tree.lock); em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); if (em) remove_extent_mapping(&tree->map_tree, em); - spin_unlock(&tree->map_tree.lock); + write_unlock(&tree->map_tree.lock); if (!em) break; kfree(em->bdev); @@ -2540,9 +2540,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, len); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); @@ -2604,9 +2604,9 @@ again: atomic_set(&multi->error, 0); } - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em && unplug_page) return 0; @@ -2763,9 +2763,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 stripe_nr; int i, j, nr = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_start, 1); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(!em || em->start != chunk_start); map = (struct map_lookup *)em->bdev; @@ -3053,9 +3053,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); - spin_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->map_tree.lock); em = 
lookup_extent_mapping(&map_tree->map_tree, logical, 1); - spin_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->map_tree.lock); /* already mapped? */ if (em && em->start <= logical && em->start + em->len > logical) { @@ -3114,9 +3114,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev->in_fs_metadata = 1; } - spin_lock(&map_tree->map_tree.lock); + write_lock(&map_tree->map_tree.lock); ret = add_extent_mapping(&map_tree->map_tree, em); - spin_unlock(&map_tree->map_tree.lock); + write_unlock(&map_tree->map_tree.lock); BUG_ON(ret); free_extent_map(em); -- cgit v1.2.3-55-g7522 From e48c465bb366c0169f7908bfe62ae7080874ee7d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Sep 2009 11:25:02 -0400 Subject: Btrfs: Fix new state initialization order As the extent state tree is manipulated, there are callbacks that are used to take extra actions when different state bits are set or cleared. One example of this is a counter for the total number of delayed allocation bytes in a single inode and in the whole FS. When new states are inserted, this callback is being invoked before we properly set up the new state. This hasn't caused problems before because the lock bit was always done first, and the existing callbacks don't care about the lock bit. This patch makes sure the state is properly set up before using the callback, which is important for later optimizations that do more work without using the lock bit.
Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 41cf1b451b41..8e168a457a37 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -367,10 +367,10 @@ static int insert_state(struct extent_io_tree *tree, } if (bits & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - set_state_cb(tree, state, bits); - state->state |= bits; state->start = start; state->end = end; + set_state_cb(tree, state, bits); + state->state |= bits; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; -- cgit v1.2.3-55-g7522 From 1edbb734b4e010974c41d2859d22a43d04f5f1cf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 13:24:36 -0400 Subject: Btrfs: reduce CPU usage in the extent_state tree Btrfs is currently mirroring some of the page state bits into its extent state tree. The goal behind this was to use it in supporting blocksizes other than the page size. But, we don't currently support that, and we're using quite a lot of CPU on the rb tree and its spin lock. This commit starts a series of cleanups to reduce the amount of work done in the extent state tree as part of each IO. This commit: * Adds the ability to lock an extent in the state tree and also set other bits. The idea is to do locking and delalloc in one call * Removes the EXTENT_WRITEBACK and EXTENT_DIRTY bits. Btrfs is using a combination of the page bits and the ordered write code for this instead. 
Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 75 ++++++++++++++++++---------------------------------- fs/btrfs/extent_io.h | 2 ++ fs/btrfs/file.c | 19 ------------- 3 files changed, 28 insertions(+), 68 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8e168a457a37..7c70613eb72c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -654,25 +654,24 @@ static void set_state_bits(struct extent_io_tree *tree, } /* - * set some bits on a range in the tree. This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set. The start of the existing - * range is returned in failed_start in this case. + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. * - * [start, end] is inclusive - * This takes the tree lock. + * [start, end] is inclusive This takes the tree lock. */ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive, u64 *failed_start, + int bits, int exclusive_bits, u64 *failed_start, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; int err = 0; - int set; u64 last_start; u64 last_end; again: @@ -707,8 +706,7 @@ hit_next: */ if (state->start == start && state->end <= end) { struct rb_node *next_node; - set = state->state & bits; - if (set && exclusive) { + if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; goto out; @@ -748,8 +746,7 @@ hit_next: * desired bit on it. 
*/ if (state->start < start) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -799,8 +796,7 @@ hit_next: * on the first half */ if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -906,19 +902,6 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); } -static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, - 0, NULL, mask); -} - -static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); -} - int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) { return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); @@ -928,13 +911,14 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. 
*/ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, mask); if (err == -EEXIST && (mask & __GFP_WAIT)) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -946,6 +930,11 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) return err; } +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, mask); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { @@ -985,7 +974,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_dirty(tree, start, end, GFP_NOFS); return 0; } @@ -1005,7 +993,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_writeback(tree, start, end, GFP_NOFS); return 0; } @@ -1563,10 +1550,7 @@ static int check_page_locked(struct extent_io_tree *tree, static int check_page_writeback(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) - end_page_writeback(page); + end_page_writeback(page); return 0; } @@ -1624,13 +1608,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) } if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + clear_extent_uptodate(tree, start, end, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } - clear_extent_writeback(tree, start, end, GFP_ATOMIC); - if (whole_page) end_page_writeback(page); 
else @@ -2208,8 +2190,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); if (last_byte <= start) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - unlock_extent(tree, start, page_end, GFP_NOFS); + clear_extent_bit(tree, start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY, + 1, 0, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2217,12 +2200,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - set_extent_uptodate(tree, start, page_end, GFP_NOFS); blocksize = inode->i_sb->s_blocksize; while (cur <= end) { if (cur >= last_byte) { - clear_extent_dirty(tree, cur, page_end, GFP_NOFS); unlock_extent(tree, unlock_start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, @@ -2255,9 +2236,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - clear_extent_dirty(tree, cur, - cur + iosize - 1, GFP_NOFS); - unlock_extent(tree, unlock_start, cur + iosize - 1, GFP_NOFS); @@ -2291,7 +2269,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, continue; } - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1); @@ -2619,7 +2596,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent(tree, start, end, GFP_NOFS); - wait_on_extent_writeback(tree, start, end); + wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 1, 1, GFP_NOFS); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5bc20abf3f3d..88d134d01fbc 100644 --- 
a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -142,6 +142,8 @@ int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8a9c76aecdf3..ef66c3d989b9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -113,8 +113,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, int err = 0; int i; struct inode *inode = fdentry(file)->d_inode; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - u64 hint_byte; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -126,20 +124,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, root->sectorsize - 1) & ~((u64)root->sectorsize - 1); end_of_last_block = start_pos + num_bytes - 1; - - lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } - btrfs_set_trans_block_group(trans, inode); - hint_byte = 0; - - /* check for reserved extents on each page, we don't want - * to reset the delalloc bit on things that already have - * extents reserved. - */ btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -154,9 +138,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. 
*/ } - err = btrfs_end_transaction(trans, root); -out_unlock: - unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); return err; } -- cgit v1.2.3-55-g7522 From 2c64c53d8d30d43d0670482503a3914dfd3d6d46 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:04:12 -0400 Subject: Btrfs: cache values for locking extents Many of the btrfs extent state tree users follow the same pattern. They lock an extent range in the tree, do some operation and then unlock. This translates to at least 2 rbtree searches, and maybe more if they are doing operations on the extent state tree. A locked extent in the tree isn't going to be merged or changed, and so we can safely return the extent state structure as a cached handle. This changes set_extent_bit to give back a cached handle, and also changes both set_extent_bit and clear_extent_bit to use the cached handle if it is available. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 125 ++++++++++++++++++++++++++++++++++++++------------- fs/btrfs/extent_io.h | 5 ++- fs/btrfs/inode.c | 6 +-- 3 files changed, 100 insertions(+), 36 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7c70613eb72c..c7a5e860fe21 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -471,10 +471,14 @@ static int clear_state_bit(struct extent_io_tree *tree, * bits were already set, or zero if none of the bits were already set. 
*/ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask) + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; + struct extent_state *cached; struct extent_state *prealloc = NULL; + struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; @@ -488,6 +492,17 @@ again: } spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + *cached_state = NULL; + if (cached->tree && cached->start == start) { + atomic_dec(&cached->refs); + state = cached; + last_end = state->end; + goto found; + } + free_extent_state(cached); + } /* * this search will find the extents that end after * our range starts @@ -496,6 +511,7 @@ again: if (!node) goto out; state = rb_entry(node, struct extent_state, rb_node); +hit_next: if (state->start > end) goto out; WARN_ON(state->end < start); @@ -555,11 +571,21 @@ again: prealloc = NULL; goto out; } - +found: + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; set |= clear_state_bit(tree, state, bits, wake, delete); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } goto search_again; out: @@ -653,6 +679,17 @@ static void set_state_bits(struct extent_io_tree *tree, state->state |= bits; } +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. 
@@ -666,6 +703,7 @@ static void set_state_bits(struct extent_io_tree *tree, static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -712,6 +750,7 @@ hit_next: goto out; } set_state_bits(tree, state, bits); + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -758,6 +797,7 @@ hit_next: goto out; if (state->end <= end) { set_state_bits(tree, state, bits); + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; @@ -782,6 +822,7 @@ hit_next: this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, bits); + cache_state(prealloc, cached_state); prealloc = NULL; BUG_ON(err == -EEXIST); if (err) @@ -805,6 +846,7 @@ hit_next: BUG_ON(err == -EEXIST); set_state_bits(tree, prealloc, bits); + cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -833,26 +875,27 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, - mask); + NULL, mask); } int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, NULL, + mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, 0, NULL, - mask); + NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, mask); + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -860,46 +903,50 @@ int 
set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, - 0, NULL, mask); + 0, NULL, NULL, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, + NULL, mask); } int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, + NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, - mask); + NULL, mask); } static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, + NULL, mask); } int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - mask); + NULL, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + NULL, mask); } int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) @@ -912,13 +959,14 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) * us if waiting is desired. 
*/ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask) + int bits, struct extent_state **cached_state, gfp_t mask) { int err; u64 failed_start; while (1) { err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, - EXTENT_LOCKED, &failed_start, mask); + EXTENT_LOCKED, &failed_start, + cached_state, mask); if (err == -EEXIST && (mask & __GFP_WAIT)) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -932,7 +980,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return lock_extent_bits(tree, start, end, 0, mask); + return lock_extent_bits(tree, start, end, 0, NULL, mask); } int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, @@ -941,21 +989,29 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, mask); + EXTENT_LOCKED, 1, 0, NULL, mask); return 0; } return 1; } +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); } /* @@ -1323,7 +1379,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (clear_delalloc) clear_bits |= EXTENT_DELALLOC; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + clear_extent_bit(tree, start, end, 
clear_bits, 1, 0, NULL, GFP_NOFS); if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) return 0; @@ -2071,6 +2127,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 iosize; u64 unlock_start; sector_t sector; + struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; int ret; @@ -2162,7 +2219,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done_unlocked; } } - lock_extent(tree, start, page_end, GFP_NOFS); + lock_extent_bits(tree, start, page_end, 0, &cached_state, GFP_NOFS); unlock_start = start; @@ -2170,7 +2227,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = tree->ops->writepage_start_hook(page, start, page_end); if (ret == -EAGAIN) { - unlock_extent(tree, start, page_end, GFP_NOFS); + unlock_extent_cached(tree, start, page_end, + &cached_state, GFP_NOFS); redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); @@ -2192,7 +2250,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (last_byte <= start) { clear_extent_bit(tree, start, page_end, EXTENT_LOCKED | EXTENT_DIRTY, - 1, 0, GFP_NOFS); + 1, 0, NULL, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2204,7 +2262,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, while (cur <= end) { if (cur >= last_byte) { - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_extent_cached(tree, unlock_start, page_end, + &cached_state, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -2236,8 +2295,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - 
unlock_extent(tree, unlock_start, cur + iosize - 1, - GFP_NOFS); + unlock_extent_cached(tree, unlock_start, + cur + iosize - 1, &cached_state, + GFP_NOFS); /* * end_io notification does not happen here for @@ -2307,11 +2367,14 @@ done: end_page_writeback(page); } if (unlock_start <= page_end) - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_extent_cached(tree, unlock_start, page_end, + &cached_state, GFP_NOFS); unlock_page(page); done_unlocked: + /* drop our reference on any cached states */ + free_extent_state(cached_state); return 0; } @@ -2599,7 +2662,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, GFP_NOFS); + 1, 1, NULL, GFP_NOFS); return 0; } @@ -2693,7 +2756,7 @@ int extent_prepare_write(struct extent_io_tree *tree, */ set_extent_bit(tree, block_start, block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, @@ -2740,7 +2803,7 @@ int try_release_extent_state(struct extent_map_tree *map, if ((mask & GFP_NOFS) == GFP_NOFS) mask = GFP_NOFS; clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - 1, 1, mask); + 1, 1, NULL, mask); } return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 88d134d01fbc..c8ead2b8c4c9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -143,7 +143,7 @@ int try_release_extent_state(struct extent_map_tree *map, gfp_t mask); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - int bits, gfp_t mask); + int bits, struct extent_state **cached, gfp_t mask); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); @@ -161,7 
+161,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask); + int bits, int wake, int delete, struct extent_state **cached, + gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f1df11718618..e494545c4202 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -854,7 +854,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, int limit = 10 * 1024 * 1042; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, GFP_NOFS); + EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; @@ -4420,7 +4420,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, GFP_NOFS); + EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); btrfs_finish_ordered_io(page->mapping->host, page_start, page_end); btrfs_put_ordered_extent(ordered); @@ -4429,7 +4429,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_ORDERED, - 1, 1, GFP_NOFS); + 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); -- cgit v1.2.3-55-g7522 From d5550c6315fe0647b7ac21a6a736bf4a42620eac Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:11:07 -0400 Subject: Btrfs: don't lock bits in the extent tree during writepage At writepage time, we have the page locked and we have the extent_map 
entry for this extent pinned in the extent_map tree. So, the page can't go away and its mapping can't change. There is no need for the extra extent_state lock bits during writepage. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c7a5e860fe21..04fafc3cffc0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2219,16 +2219,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done_unlocked; } } - lock_extent_bits(tree, start, page_end, 0, &cached_state, GFP_NOFS); - - unlock_start = start; - if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); if (ret == -EAGAIN) { - unlock_extent_cached(tree, start, page_end, - &cached_state, GFP_NOFS); redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); @@ -2244,13 +2238,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) - printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); - if (last_byte <= start) { - clear_extent_bit(tree, start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY, - 1, 0, NULL, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2262,8 +2250,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, while (cur <= end) { if (cur >= last_byte) { - unlock_extent_cached(tree, unlock_start, page_end, - &cached_state, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -2295,10 +2281,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, 
*/ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - unlock_extent_cached(tree, unlock_start, - cur + iosize - 1, &cached_state, - GFP_NOFS); - /* * end_io notification does not happen here for * compressed extents @@ -2366,9 +2348,6 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (unlock_start <= page_end) - unlock_extent_cached(tree, unlock_start, page_end, - &cached_state, GFP_NOFS); unlock_page(page); done_unlocked: -- cgit v1.2.3-55-g7522 From 9655d2982b53fdb38a9e0f2f11315b99b92d66e2 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 15:22:30 -0400 Subject: Btrfs: use a cached state for extent state operations during delalloc This changes the btrfs code to find delalloc ranges in the extent state tree to use the new state caching code from set/test bit. It reduces one of the biggest causes of rbtree searches in the writeback path. test_range_bit is also modified to take the cached state as a starting point while searching. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 46 +++++++++++++++++++++++++++++++--------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 6 +++--- fs/btrfs/ordered-data.c | 8 ++++---- fs/btrfs/relocation.c | 2 +- 5 files changed, 40 insertions(+), 24 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 04fafc3cffc0..c9a438d374b6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -720,6 +720,13 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } /* * this search will find all the extents that end after * our range starts. 
@@ -1286,6 +1293,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, u64 delalloc_start; u64 delalloc_end; u64 found; + struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1323,6 +1331,7 @@ again: /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ + free_extent_state(cached_state); if (!loops) { unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); max_bytes = PAGE_CACHE_SIZE - offset; @@ -1336,18 +1345,21 @@ again: BUG_ON(ret); /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1); + EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); goto again; } + free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -1530,14 +1542,17 @@ out: * range is found set. 
*/ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled) + int bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; int bitset = 0; spin_lock(&tree->lock); - node = tree_search(tree, start); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); while (node && start <= end) { state = rb_entry(node, struct extent_state, rb_node); @@ -1580,7 +1595,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); return 0; } @@ -1594,7 +1609,7 @@ static int check_page_locked(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); return 0; } @@ -2032,7 +2047,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); cur = cur + iosize; @@ -2305,7 +2321,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } /* leave this out until we have a page_mkwrite call */ if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0)) { + EXTENT_DIRTY, 0, NULL)) { cur = cur + iosize; pg_offset += iosize; continue; @@ -2721,7 +2737,7 @@ int extent_prepare_write(struct extent_io_tree *tree, !isnew && !PageUptodate(page) && (block_off_end > to || 
block_off_start < from) && !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { u64 sector; u64 extent_offset = block_start - em->start; size_t iosize; @@ -2776,7 +2792,7 @@ int try_release_extent_state(struct extent_map_tree *map, int ret = 1; if (test_range_bit(tree, start, end, - EXTENT_IOBITS | EXTENT_ORDERED, 0)) + EXTENT_IOBITS | EXTENT_ORDERED, 0, NULL)) ret = 0; else { if ((mask & GFP_NOFS) == GFP_NOFS) @@ -2821,7 +2837,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, extent_map_end(em) - 1, EXTENT_LOCKED | EXTENT_WRITEBACK | EXTENT_ORDERED, - 0)) { + 0, NULL)) { remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); @@ -3237,7 +3253,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) return 1; while (start <= end) { @@ -3267,7 +3283,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1); + EXTENT_UPTODATE, 1, NULL); if (ret) return ret; @@ -3303,7 +3319,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c8ead2b8c4c9..09cd6fa3cc86 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -157,7 +157,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 max_bytes, unsigned long bits); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled); + int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree 
*tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e494545c4202..3f8e93de2989 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1376,7 +1376,7 @@ again: /* already ordered? We're done */ if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_ORDERED, 0)) { + EXTENT_ORDERED, 0, NULL)) { goto out; } @@ -1417,7 +1417,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) int ret; ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_ORDERED, 0); + EXTENT_ORDERED, 0, NULL); if (ret) return 0; @@ -1795,7 +1795,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, GFP_NOFS); return 0; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806c682f..7f751e462f0b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -262,7 +262,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, ret = test_range_bit(io_tree, entry->file_offset, entry->file_offset + entry->len - 1, - EXTENT_ORDERED, 0); + EXTENT_ORDERED, 0, NULL); if (ret == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); out: @@ -522,7 +522,7 @@ again: end--; } if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + EXTENT_ORDERED | EXTENT_DELALLOC, 0, NULL)) { schedule_timeout(1); goto again; } @@ -613,7 +613,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (test_range_bit(io_tree, disk_i_size, ordered->file_offset + ordered->len - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { goto out; } /* @@ -664,7 +664,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (i_size_test > entry_end(ordered) 
&& !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { new_i_size = min_t(u64, i_size_test, i_size_read(inode)); } BTRFS_I(inode)->disk_i_size = new_i_size; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4adab903fc2b..3be16ccc7eea 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2180,7 +2180,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, struct reloc_control *rc) { if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; return 0; } -- cgit v1.2.3-55-g7522 From 8b62b72b26bcd72082c4a69d179dd906bcc22200 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 2 Sep 2009 16:53:46 -0400 Subject: Btrfs: Use PagePrivate2 to track pages in the data=ordered code. Btrfs writes go through delalloc to the data=ordered code. This makes sure that all of the data is on disk before the metadata that references it. The tracking means that we have to make sure each page in an extent is fully written before we add that extent into the on-disk btree. This was done in the past by setting the EXTENT_ORDERED bit for the range of an extent when it was added to the data=ordered code, and then clearing the EXTENT_ORDERED bit in the extent state tree as each page finished IO. One of the reasons we had to do this was because sometimes pages are magically dirtied without page_mkwrite being called. The EXTENT_ORDERED bit is checked at writepage time, and if it isn't there, our page become dirty without going through the proper path. These bit operations make for a number of rbtree searches for each page, and can cause considerable lock contention. This commit switches from the EXTENT_ORDERED bit to use PagePrivate2. As pages go into the ordered code, PagePrivate2 is set on each one. This is a cheap operation because we already have all the pages locked and ready to go. 
As IO finishes, the PagePrivate2 bit is cleared and the ordered accounting is updated for each page. At writepage time, if the PagePrivate2 bit is missing, we go into the writepage fixup code to handle improperly dirtied pages. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 29 ++++++++++------------------- fs/btrfs/extent_io.h | 9 ++++----- fs/btrfs/inode.c | 47 ++++++++++++++++++++++++++++++----------------- fs/btrfs/ordered-data.c | 29 +++++++++++++++-------------- fs/btrfs/ordered-data.h | 3 +++ 5 files changed, 62 insertions(+), 55 deletions(-) (limited to 'fs/btrfs/extent_io.c') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c9a438d374b6..a102422cd92e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -885,13 +885,6 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, NULL, - mask); -} - int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { @@ -921,13 +914,6 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, - NULL, mask); -} - int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { @@ -1373,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, int clear_unlock, int clear_delalloc, int clear_dirty, int set_writeback, - int end_writeback) + int end_writeback, + int set_private2) { int ret; struct page *pages[16]; @@ -1392,7 +1379,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, clear_bits |= EXTENT_DELALLOC; clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + if
(!(unlock_pages || clear_dirty || set_writeback || end_writeback || + set_private2)) return 0; while (nr_pages > 0) { @@ -1400,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode, min_t(unsigned long, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { + + if (set_private2) + SetPagePrivate2(pages[i]); + if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; @@ -2792,7 +2784,7 @@ int try_release_extent_state(struct extent_map_tree *map, int ret = 1; if (test_range_bit(tree, start, end, - EXTENT_IOBITS | EXTENT_ORDERED, 0, NULL)) + EXTENT_IOBITS, 0, NULL)) ret = 0; else { if ((mask & GFP_NOFS) == GFP_NOFS) @@ -2835,8 +2827,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK | - EXTENT_ORDERED, + EXTENT_LOCKED | EXTENT_WRITEBACK, 0, NULL)) { remove_extent_mapping(map, em); /* once for the rb tree */ diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 09cd6fa3cc86..14ed16fd862d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -13,10 +13,8 @@ #define EXTENT_DEFRAG (1 << 6) #define EXTENT_DEFRAG_DONE (1 << 7) #define EXTENT_BUFFER_FILLED (1 << 8) -#define EXTENT_ORDERED (1 << 9) -#define EXTENT_ORDERED_METADATA (1 << 10) -#define EXTENT_BOUNDARY (1 << 11) -#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ @@ -285,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, int clear_unlock, int clear_delalloc, int clear_dirty, int set_writeback, - int end_writeback); + int end_writeback, + int set_private2); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3f8e93de2989..739a245e25d6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -426,7 +426,7 @@ again: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, end, 
NULL, 1, 0, - 0, 1, 1, 1); + 0, 1, 1, 1, 0); ret = 0; goto free_pages_out; } @@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0); + NULL, 1, 1, 0, 1, 1, 0, 0); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, end, NULL, 1, 1, - 1, 1, 1, 1); + 1, 1, 1, 1, 0); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -777,11 +777,14 @@ static noinline int cow_file_range(struct inode *inode, /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage */ extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, locked_page, unlock, 1, - 1, 0, 0, 0); + 1, 0, 0, 0, 1); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -1102,7 +1105,7 @@ out_check: extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0); + locked_page, 1, 1, 1, 0, 0, 0, 1); cur_offset = extent_end; if (cur_offset > end) break; @@ -1375,10 +1378,8 @@ again: lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); /* already ordered? 
We're done */ - if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_ORDERED, 0, NULL)) { + if (PagePrivate2(page)) goto out; - } ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { @@ -1414,11 +1415,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) struct inode *inode = page->mapping->host; struct btrfs_writepage_fixup *fixup; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_ORDERED, 0, NULL); - if (ret) + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) return 0; if (PageChecked(page)) @@ -1624,6 +1623,7 @@ nocow: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -4403,13 +4403,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. 
+ * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent(tree, page_start, page_end, GFP_NOFS); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); @@ -4421,14 +4429,19 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } btrfs_put_ordered_extent(ordered); lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_ORDERED, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7f751e462f0b..4a9c8c4cec25 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * * len is the length of the extent * - * This also sets the EXTENT_ORDERED bit on the range in the inode. - * * The tree is given a single reference on the ordered extent that was * inserted. 
*/ @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->start = start; entry->len = len; entry->disk_len = disk_len; + entry->bytes_left = len; entry->inode = inode; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &entry->rb_node); BUG_ON(node); - set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, - entry_end(entry) - 1, GFP_NOFS); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, &BTRFS_I(inode)->root->fs_info->ordered_extents); @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int ret; tree = &BTRFS_I(inode)->ordered_tree; mutex_lock(&tree->mutex); - clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, - GFP_NOFS); node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, goto out; } - ret = test_range_bit(io_tree, entry->file_offset, - entry->file_offset + entry->len - 1, - EXTENT_ORDERED, 0, NULL); - if (ret == 0) + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; out: mutex_unlock(&tree->mutex); return ret == 0; @@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 orig_end; u64 wait_end; struct btrfs_ordered_extent *ordered; + int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -502,6 +501,7 @@ again: orig_end >> 
PAGE_CACHE_SHIFT); end = orig_end; + found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -514,6 +514,7 @@ again: btrfs_put_ordered_extent(ordered); break; } + found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); @@ -521,8 +522,8 @@ again: break; end--; } - if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_ORDERED | EXTENT_DELALLOC, 0, NULL)) { + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { schedule_timeout(1); goto again; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3d31c8827b01..993a7ea45c70 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -85,6 +85,9 @@ struct btrfs_ordered_extent { /* extent length on disk */ u64 disk_len; + /* number of bytes that still need writing */ + u64 bytes_left; + /* flags (described above) */ unsigned long flags; -- cgit v1.2.3-55-g7522