author    | Linus Torvalds | 2019-07-17 00:12:56 +0200
committer | Linus Torvalds | 2019-07-17 00:12:56 +0200
commit    | a18f8775419d3df282dd83efdb51c5a64d092f31 (patch)
tree      | 1e0abc5c1d30e8bc58dc23099017eca496992fd2 /fs/btrfs
parent    | Merge tag 'arc-5.3-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vgup... (diff)
parent    | btrfs: fix memory leak of path on error return path (diff)
download  | kernel-qcow2-linux-a18f8775419d3df282dd83efdb51c5a64d092f31.tar.gz
          | kernel-qcow2-linux-a18f8775419d3df282dd83efdb51c5a64d092f31.tar.xz
          | kernel-qcow2-linux-a18f8775419d3df282dd83efdb51c5a64d092f31.zip
Merge tag 'for-5.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"Highlights:
- chunks that have been trimmed and have not changed since the last
  mount are tracked and skipped on repeated trims
- use hw assisted crc32c on more arches, with speedups when native
  instructions or an optimized implementation are available
- the RAID56 incompat bit is automatically removed when the last
block group of that type is removed
Fixes:
- fsync fix for reflink on NODATACOW files that could lead to ENOSPC
- fix data loss after evicting an inode, renaming it and fsyncing it
- fix fsync not persisting dentry deletions due to inode evictions
- update ctime/mtime/iversion after hole punching
- fix compression type validation (reported by KASAN)
- send won't be allowed to start when relocation is in progress, as
  this can cause spurious errors or produce an incorrect send stream
Core:
- new tracepoints for space update
- tree-checker: better check for end of extents for some tree items
- preparatory work for more checksum algorithms
- run delayed iputs at unlink time instead of pushing the work to the
  cleaner thread, where it's not properly throttled
- wrap block mapping in structures and helpers, as a base for further
  refactoring
- split large files, part 1:
- space info handling
- block group reservations
- delayed refs
- delayed allocation
- other cleanups and refactoring"
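
The "hw assisted crc32c" and "preparatory work for more checksum algorithms" items above correspond to btrfs switching from direct crc32c() calls to the kernel crypto shash API (CRYPTO_CRC32C is now selected in Kconfig and checksumming goes through fs_info->csum_shash in the diff below). As a rough illustration only, here is a minimal sketch of that crypto_shash pattern; the function name example_csum_data is hypothetical and not part of the patch set:

/*
 * Minimal sketch of the crypto_shash pattern this series adopts:
 * allocate a "crc32c" transform once, then hash buffers through a
 * stack-allocated descriptor (btrfs keeps the tfm in fs_info->csum_shash).
 */
#include <linux/err.h>
#include <crypto/hash.h>

static int example_csum_data(const void *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	int ret;

	/* crc32c is the only checksum wired up at this point */
	tfm = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(shash, tfm);

		shash->tfm = tfm;
		/* init + update + final in one call */
		ret = crypto_shash_digest(shash, data, len, out);
	}

	crypto_free_shash(tfm);
	return ret;
}

In the actual series the transform is allocated once at mount time and reused for every metadata and data checksum, which is why only the descriptor lives on the stack here.
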
* tag 'for-5.3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (103 commits)
btrfs: fix memory leak of path on error return path
btrfs: move the subvolume reservation stuff out of extent-tree.c
btrfs: migrate the delalloc space stuff to it's own home
btrfs: migrate btrfs_trans_release_chunk_metadata
btrfs: migrate the delayed refs rsv code
btrfs: Evaluate io_tree in find_lock_delalloc_range()
btrfs: migrate the global_block_rsv helpers to block-rsv.c
btrfs: migrate the block-rsv code to block-rsv.c
btrfs: stop using block_rsv_release_bytes everywhere
btrfs: cleanup the target logic in __btrfs_block_rsv_release
btrfs: export __btrfs_block_rsv_release
btrfs: export btrfs_block_rsv_add_bytes
btrfs: move btrfs_block_rsv definitions into it's own header
btrfs: Simplify update of space_info in __reserve_metadata_bytes()
btrfs: unexport can_overcommit
btrfs: move reserve_metadata_bytes and supporting code to space-info.c
btrfs: move dump_space_info to space-info.c
btrfs: export block_rsv_use_bytes
btrfs: move btrfs_space_info_add_*_bytes to space-info.c
btrfs: move the space info update macro to space-info.h
...
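
Several of the commits above ("migrate the block-rsv code to block-rsv.c", "export btrfs_block_rsv_add_bytes", and so on) move the block reserve accounting into its own file without changing its semantics. As a simplified model only (the types and function names here are illustrative, not btrfs symbols), the invariant being relocated is a reserve that tracks a target size and the bytes it actually holds:

/*
 * Userspace-style model of the block reserve accounting moved into
 * block-rsv.c: use_bytes() takes from the reserve and fails if it does
 * not hold enough, add_bytes() gives bytes back and may grow the target.
 */
#include <stdbool.h>
#include <stdint.h>

struct example_block_rsv {
	uint64_t size;     /* how many bytes the reserve wants to hold */
	uint64_t reserved; /* how many bytes it currently holds */
	bool full;         /* reserved has caught up with size */
};

static int example_rsv_use_bytes(struct example_block_rsv *rsv,
				 uint64_t num_bytes)
{
	if (rsv->reserved < num_bytes)
		return -1;	/* btrfs returns -ENOSPC here */
	rsv->reserved -= num_bytes;
	if (rsv->reserved < rsv->size)
		rsv->full = false;
	return 0;
}

static void example_rsv_add_bytes(struct example_block_rsv *rsv,
				  uint64_t num_bytes, bool update_size)
{
	rsv->reserved += num_bytes;
	if (update_size)
		rsv->size += num_bytes;
	else if (rsv->reserved >= rsv->size)
		rsv->full = true;
}

This mirrors btrfs_block_rsv_use_bytes() and btrfs_block_rsv_add_bytes() in the new block-rsv.c shown below, minus the spinlock that serializes the real counters.
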
Diffstat (limited to 'fs/btrfs')
50 files changed, 3786 insertions, 3124 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 23537bc8c827..212b4a854f2c 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -2,7 +2,8 @@ config BTRFS_FS tristate "Btrfs filesystem support" - select LIBCRC32C + select CRYPTO + select CRYPTO_CRC32C select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index ca693dd554e9..76a843198bcb 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -10,7 +10,8 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o free-space-tree.o tree-checker.o + uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ + block-rsv.o delalloc-space.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 982152d3f920..89116afda7a2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1465,12 +1465,11 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + struct ulist *roots, struct ulist *tmp) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; - struct ulist *tmp = NULL; - struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; struct seq_list elem = SEQ_LIST_INIT(elem); @@ -1481,12 +1480,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) .share_count = 0, }; - tmp = ulist_alloc(GFP_NOFS); - roots = ulist_alloc(GFP_NOFS); - if (!tmp || !roots) { - ret = -ENOMEM; - goto out; - } + ulist_init(roots); + ulist_init(tmp); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { @@ -1527,8 +1522,8 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) up_read(&fs_info->commit_root_sem); } out: - ulist_free(tmp); - ulist_free(roots); + ulist_release(roots); + ulist_release(tmp); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 54d58988483a..777f61dc081e 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -57,7 +57,8 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, + struct ulist *roots, struct ulist *tmp_ulist); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c new file mode 100644 index 000000000000..698470b9f32d --- /dev/null +++ b/fs/btrfs/block-rsv.c @@ -0,0 +1,425 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "block-rsv.h" +#include "space-info.h" +#include "math.h" +#include "transaction.h" + +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes, + u64 *qgroup_to_release_ret) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 qgroup_to_release = 0; + u64 ret; + + 
spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) { + num_bytes = block_rsv->size; + qgroup_to_release = block_rsv->qgroup_rsv_size; + } + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { + qgroup_to_release = block_rsv->qgroup_rsv_reserved - + block_rsv->qgroup_rsv_size; + block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; + } else { + qgroup_to_release = 0; + } + spin_unlock(&block_rsv->lock); + + ret = num_bytes; + if (num_bytes > 0) { + if (dest) { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + num_bytes); + } + if (qgroup_to_release_ret) + *qgroup_to_release_ret = qgroup_to_release; + return ret; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + bool update_size) +{ + int ret; + + ret = btrfs_block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + btrfs_block_rsv_add_bytes(dst, num_bytes, update_size); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + rsv->type = type; +} + +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + unsigned short type) +{ + struct btrfs_block_rsv *block_rsv; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + if (!rsv) + return; + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); + kfree(rsv); +} + +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + if (num_bytes == 0) + return 0; + + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = div_factor(block_rsv->size, min_factor); + if (block_rsv->reserved >= num_bytes) + ret = 0; + spin_unlock(&block_rsv->lock); + + return ret; +} + +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush) +{ + u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else + num_bytes -= block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (!ret) 
+ return 0; + + ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); + return 0; + } + + return ret; +} + +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *target = NULL; + + /* + * If we are the delayed_rsv then push to the global rsv, otherwise dump + * into the delayed rsv if it is not full. + */ + if (block_rsv == delayed_rsv) + target = global_rsv; + else if (block_rsv != global_rsv && !delayed_rsv->full) + target = delayed_rsv; + + if (target && block_rsv->space_info != target->space_info) + target = NULL; + + return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, + qgroup_to_release); +} + +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) +{ + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 min_bytes; + + if (global_rsv->space_info != dest->space_info) + return -ENOSPC; + + spin_lock(&global_rsv->lock); + min_bytes = div_factor(global_rsv->size, min_factor); + if (global_rsv->reserved < min_bytes + num_bytes) { + spin_unlock(&global_rsv->lock); + return -ENOSPC; + } + global_rsv->reserved -= num_bytes; + if (global_rsv->reserved < global_rsv->size) + global_rsv->full = 0; + spin_unlock(&global_rsv->lock); + + btrfs_block_rsv_add_bytes(dest, num_bytes, true); + return 0; +} + +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. 
+ */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + num_bytes = max_t(u64, num_bytes, SZ_16M); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); + + block_rsv->size = min_t(u64, num_bytes, SZ_512M); + + if (block_rsv->reserved < block_rsv->size) { + num_bytes = btrfs_space_info_used(sinfo, true); + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + num_bytes = min(num_bytes, + block_rsv->size - block_rsv->reserved); + block_rsv->reserved += num_bytes; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, + 1); + } + } else if (block_rsv->reserved > block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, + -num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + sinfo->flags, num_bytes, 0); + block_rsv->reserved = block_rsv->size; + } + + if (block_rsv->reserved == block_rsv->size) + block_rsv->full = 1; + else + block_rsv->full = 0; + + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); +} + +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->delayed_block_rsv.space_info = space_info; + fs_info->delayed_refs_rsv.space_info = space_info; + + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + if (fs_info->quota_root) + fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_update_global_block_rsv(fs_info); +} + +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_block_rsv.size > 0); + WARN_ON(fs_info->delayed_block_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); + WARN_ON(fs_info->delayed_refs_rsv.size > 0); +} + +static struct btrfs_block_rsv *get_block_rsv( + const struct btrfs_trans_handle *trans, + const struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = NULL; + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == fs_info->csum_root && trans->adding_csums) || + (root == fs_info->uuid_root)) + block_rsv = trans->block_rsv; + + if (!block_rsv) + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &fs_info->empty_block_rsv; + + return block_rsv; +} + +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct 
btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool global_updated = false; + + block_rsv = get_block_rsv(trans, root); + + if (unlikely(block_rsv->size == 0)) + goto try_reserve; +again: + ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + if (block_rsv->failfast) + return ERR_PTR(ret); + + if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { + global_updated = true; + btrfs_update_global_block_rsv(fs_info); + goto again; + } + + /* + * The global reserve still exists to save us from ourselves, so don't + * warn_on if we are short on our delayed refs reserve. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && + btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL * 10, + /*DEFAULT_RATELIMIT_BURST*/ 1); + if (__ratelimit(&_rs)) + WARN(1, KERN_DEBUG + "BTRFS: block rsv returned %d\n", ret); + } +try_reserve: + ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize, + BTRFS_RESERVE_NO_FLUSH); + if (!ret) + return block_rsv; + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve if its space type is the same as the global + * reservation. + */ + if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && + block_rsv->space_info == global_rsv->space_info) { + ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + return ERR_PTR(ret); +} diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h new file mode 100644 index 000000000000..d1428bb73fc5 --- /dev/null +++ b/fs/btrfs/block-rsv.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_BLOCK_RSV_H +#define BTRFS_BLOCK_RSV_H + +struct btrfs_trans_handle; +enum btrfs_reserve_flush_enum; + +/* + * Types of block reserves + */ +enum { + BTRFS_BLOCK_RSV_GLOBAL, + BTRFS_BLOCK_RSV_DELALLOC, + BTRFS_BLOCK_RSV_TRANS, + BTRFS_BLOCK_RSV_CHUNK, + BTRFS_BLOCK_RSV_DELOPS, + BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_EMPTY, + BTRFS_BLOCK_RSV_TEMP, +}; + +struct btrfs_block_rsv { + u64 size; + u64 reserved; + struct btrfs_space_info *space_info; + spinlock_t lock; + unsigned short full; + unsigned short type; + unsigned short failfast; + + /* + * Qgroup equivalent for @size @reserved + * + * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care + * about things like csum size nor how many tree blocks it will need to + * reserve. + * + * Qgroup cares more about net change of the extent usage. + * + * So for one newly inserted file extent, in worst case it will cause + * leaf split and level increase, nodesize for each file extent is + * already too much. + * + * In short, qgroup_size/reserved is the upper limit of possible needed + * qgroup metadata reservation. 
+ */ + u64 qgroup_rsv_size; + u64 qgroup_rsv_reserved; +}; + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, + unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, u64 min_reserved, + enum btrfs_reserve_flush_enum flush); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + bool update_size); +int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *dest, u64 num_bytes, + int min_factor); +void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, bool update_size); +u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, u64 *qgroup_to_release); +void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info); +void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info); +struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize); + +static inline void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); +} + +static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, + u32 blocksize) +{ + btrfs_block_rsv_add_bytes(block_rsv, blocksize, false); + btrfs_block_rsv_release(fs_info, block_rsv, 0); +} + +#endif /* BTRFS_BLOCK_RSV_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d5b438706b77..f853835c409c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -337,22 +337,34 @@ static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); } +/* Array of bytes with variable length, hexadecimal format 0x1234 */ +#define CSUM_FMT "0x%*phN" +#define CSUM_FMT_VALUE(size, bytes) size, bytes + static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u32 csum, u32 csum_expected, int mirror_num) + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; + struct btrfs_super_block *sb = root->fs_info->super_copy; + const u16 csum_size = btrfs_super_csum_size(sb); /* Output minus objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) btrfs_warn_rl(root->fs_info, - "csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d", +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), - logical_start, csum, csum_expected, mirror_num); + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, 
csum_expected), + mirror_num); else btrfs_warn_rl(root->fs_info, - "csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d", +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), - logical_start, csum, csum_expected, mirror_num); + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); } #endif diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index b0c8094528d1..81a9731959a9 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -83,7 +83,7 @@ #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> -#include <linux/crc32c.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1710,9 +1710,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages) { struct btrfs_fs_info *fs_info = state->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; - u32 crc = ~(u32)0; unsigned int i; if (num_pages * PAGE_SIZE < state->metablock_size) @@ -1723,14 +1723,17 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) return 1; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + for (i = 0; i < num_pages; i++) { u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); size_t sublen = i ? PAGE_SIZE : (PAGE_SIZE - BTRFS_CSUM_SIZE); - crc = crc32c(crc, data, sublen); + crypto_shash_update(shash, data, sublen); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, h->csum, state->csum_size)) return 1; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 84dd4a8980c5..60c47b417a4b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/sched/mm.h> #include <linux/log2.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -42,6 +43,22 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) return NULL; } +bool btrfs_compress_is_valid_type(const char *str, size_t len) +{ + int i; + + for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { + size_t comp_len = strlen(btrfs_compress_types[i]); + + if (len < comp_len) + continue; + + if (!strncmp(btrfs_compress_types[i], str, comp_len)) + return true; + } + return false; +} + static int btrfs_decompress_bio(struct compressed_bio *cb); static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, @@ -57,32 +74,37 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct compressed_bio *cb, u64 disk_start) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int ret; struct page *page; unsigned long i; char *kaddr; - u32 csum; - u32 *cb_sum = &cb->sums; + u8 csum[BTRFS_CSUM_SIZE]; + u8 *cb_sum = cb->sums; if (inode->flags & BTRFS_INODE_NODATASUM) return 0; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < cb->nr_pages; i++) { page = cb->compressed_pages[i]; - csum = ~(u32)0; + crypto_shash_init(shash); kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); - btrfs_csum_final(csum, (u8 *)&csum); + crypto_shash_update(shash, kaddr, PAGE_SIZE); kunmap_atomic(kaddr); + 
crypto_shash_final(shash, (u8 *)&csum); - if (csum != *cb_sum) { - btrfs_print_data_csum_error(inode, disk_start, csum, - *cb_sum, cb->mirror_num); + if (memcmp(&csum, cb_sum, csum_size)) { + btrfs_print_data_csum_error(inode, disk_start, + csum, cb_sum, cb->mirror_num); ret = -EIO; goto fail; } - cb_sum++; + cb_sum += csum_size; } ret = 0; @@ -318,7 +340,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -360,7 +383,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_endio(bio); } - bio = btrfs_bio_alloc(bdev, first_byte); + bio = btrfs_bio_alloc(first_byte); + bio_set_dev(bio, bdev); bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; @@ -536,7 +560,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct extent_map *em; blk_status_t ret = BLK_STS_RESOURCE; int faili = 0; - u32 *sums; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -558,7 +583,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; - sums = &cb->sums; + sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -597,7 +622,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); + bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -617,6 +643,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->mapping = NULL; if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + unsigned int nr_sectors; + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -634,8 +662,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); + + nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + fs_info->sectorsize); + sums += csum_size * nr_sectors; ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { @@ -643,7 +673,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_endio(comp_bio); } - comp_bio = btrfs_bio_alloc(bdev, cur_disk_byte); + comp_bio = btrfs_bio_alloc(cur_disk_byte); + bio_set_dev(comp_bio, bdev); comp_bio->bi_opf = REQ_OP_READ; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9976fe0f7526..2035b8eb1290 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -61,7 +61,7 @@ struct compressed_bio { * the start of a variable length array of checksums only * used by reads */ - u32 sums; + u8 sums[]; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -173,6 +173,7 @@ extern const struct btrfs_compress_op btrfs_lzo_compress; extern const struct btrfs_compress_op 
btrfs_zstd_compress; const char* btrfs_compress_type2str(enum btrfs_compression_type type); +bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0a61dff27f57..299e11e6c554 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,6 +19,7 @@ #include <linux/kobject.h> #include <trace/events/btrfs.h> #include <asm/kmap_types.h> +#include <asm/unaligned.h> #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/btrfs_tree.h> @@ -31,11 +32,13 @@ #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" +#include "block-rsv.h" struct btrfs_trans_handle; struct btrfs_transaction; struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; +struct btrfs_space_info; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; @@ -45,7 +48,16 @@ struct btrfs_ref; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ -#define BTRFS_MAX_MIRRORS 3 +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1 with 2 copies. + */ +#define BTRFS_MAX_MIRRORS (2 + 1) #define BTRFS_MAX_LEVEL 8 @@ -72,6 +84,7 @@ struct btrfs_ref; /* four bytes for CRC32 */ static const int btrfs_csum_sizes[] = { 4 }; +static const char *btrfs_csum_names[] = { "crc32c" }; #define BTRFS_EMPTY_DIR_SIZE 0 @@ -99,10 +112,6 @@ static inline u32 count_max_extents(u64 size) return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); } -struct btrfs_mapping_tree { - struct extent_map_tree map_tree; -}; - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -395,115 +404,6 @@ struct raid_kobject { struct list_head list; }; -struct btrfs_space_info { - spinlock_t lock; - - u64 total_bytes; /* total bytes in the space, - this doesn't take mirrors into account */ - u64 bytes_used; /* total bytes used, - this doesn't take mirrors into account */ - u64 bytes_pinned; /* total bytes pinned, will be freed when the - transaction finishes */ - u64 bytes_reserved; /* total bytes the allocator has reserved for - current allocations */ - u64 bytes_may_use; /* number of bytes that may be used for - delalloc/allocations */ - u64 bytes_readonly; /* total bytes that are read only */ - - u64 max_extent_size; /* This will hold the maximum extent size of - the space info if we had an ENOSPC in the - allocator. */ - - unsigned int full:1; /* indicates that we cannot allocate any more - chunks for this space */ - unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ - - unsigned int flush:1; /* set if we are trying to make space */ - - unsigned int force_alloc; /* set if we need to force a chunk - alloc for this space */ - - u64 disk_used; /* total bytes used on disk */ - u64 disk_total; /* total bytes on disk, takes mirrors into - account */ - - u64 flags; - - /* - * bytes_pinned is kept in line with what is actually pinned, as in - * we've called update_block_group and dropped the bytes_used counter - * and increased the bytes_pinned counter. 
However this means that - * bytes_pinned does not reflect the bytes that will be pinned once the - * delayed refs are flushed, so this counter is inc'ed every time we - * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zeroed every - * time the transaction commits. - */ - struct percpu_counter total_bytes_pinned; - - struct list_head list; - /* Protected by the spinlock 'lock'. */ - struct list_head ro_bgs; - struct list_head priority_tickets; - struct list_head tickets; - /* - * tickets_id just indicates the next ticket will be handled, so note - * it's not stored per ticket. - */ - u64 tickets_id; - - struct rw_semaphore groups_sem; - /* for block groups in our same type */ - struct list_head block_groups[BTRFS_NR_RAID_TYPES]; - wait_queue_head_t wait; - - struct kobject kobj; - struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; -}; - -/* - * Types of block reserves - */ -enum { - BTRFS_BLOCK_RSV_GLOBAL, - BTRFS_BLOCK_RSV_DELALLOC, - BTRFS_BLOCK_RSV_TRANS, - BTRFS_BLOCK_RSV_CHUNK, - BTRFS_BLOCK_RSV_DELOPS, - BTRFS_BLOCK_RSV_DELREFS, - BTRFS_BLOCK_RSV_EMPTY, - BTRFS_BLOCK_RSV_TEMP, -}; - -struct btrfs_block_rsv { - u64 size; - u64 reserved; - struct btrfs_space_info *space_info; - spinlock_t lock; - unsigned short full; - unsigned short type; - unsigned short failfast; - - /* - * Qgroup equivalent for @size @reserved - * - * Unlike normal @size/@reserved for inode rsv, qgroup doesn't care - * about things like csum size nor how many tree blocks it will need to - * reserve. - * - * Qgroup cares more about net change of the extent usage. - * - * So for one newly inserted file extent, in worst case it will cause - * leaf split and level increase, nodesize for each file extent is - * already too much. - * - * In short, qgroup_size/reserved is the upper limit of possible needed - * qgroup metadata reservation. - */ - u64 qgroup_rsv_size; - u64 qgroup_rsv_reserved; -}; - /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata @@ -786,11 +686,18 @@ enum { /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. + * Set and cleared while holding fs_info::balance_mutex. */ BTRFS_FS_BALANCE_RUNNING, /* Indicate that the cleaner thread is awake and doing something. */ BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, }; struct btrfs_fs_info { @@ -824,7 +731,7 @@ struct btrfs_fs_info { struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ - struct btrfs_mapping_tree mapping_tree; + struct extent_map_tree mapping_tree; /* * block reservation for extent, checksum, root tree and @@ -1160,6 +1067,14 @@ struct btrfs_fs_info { spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; + struct crypto_shash *csum_shash; + + /* + * Number of send operations in progress. + * Updated while holding fs_info::balance_mutex. 
+ */ + int send_in_progress; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -2451,6 +2366,11 @@ static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) return btrfs_csum_sizes[t]; } +static inline const char *btrfs_super_csum_name(u16 csum_type) +{ + /* csum type is validated at mount time */ + return btrfs_csum_names[csum_type]; +} /* * The leaf data grows from end-to-front in the node. @@ -2642,6 +2562,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ btrfs_item_offset_nr(leaf, slot))) +static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) +{ + return crc32c(crc, address, length); +} + +static inline void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static inline u64 btrfs_name_hash(const char *name, int len) { return crc32c((u32)~1, name, len); @@ -2656,12 +2586,6 @@ static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, return (u64) crc32c(parent_objectid, name, len); } -static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) -{ - return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && - (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); @@ -2698,8 +2622,6 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2814,17 +2736,28 @@ enum btrfs_flush_state { COMMIT_TRANS = 9, }; -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len); -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len); -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +/* + * control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. 
This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + */ +enum btrfs_chunk_alloc_enum { + CHUNK_ALLOC_NO_FORCE, + CHUNK_ALLOC_LIMITED, + CHUNK_ALLOC_FORCE, +}; + +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force); int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -2834,41 +2767,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len); -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type); -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv); -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, u64 num_bytes, - bool update_size); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush); -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); @@ -3186,7 +3084,8 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, @@ -3514,8 +3413,7 @@ __cold static inline void assfail(const char *expr, const char *file, int line) { if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { - pr_err("assertion failed: %s, file: %s, line: %d\n", - expr, file, line); + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); BUG(); } } @@ 
-3599,10 +3497,11 @@ do { \ /* compatibility and incompatibility defines */ #define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3615,18 +3514,20 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu feature flag", - flag); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ + #opt) static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char* name) { struct btrfs_super_block *disk_super; u64 features; @@ -3639,8 +3540,9 @@ static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu feature flag", - flag); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } @@ -3657,10 +3559,11 @@ static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) } #define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3673,18 +3576,20 @@ static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, if (!(features & flag)) { features |= flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "setting %llu ro feature flag", - flag); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } } #define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ + #opt) static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag) + u64 flag, const char *name) { struct btrfs_super_block *disk_super; u64 features; @@ -3697,8 +3602,9 @@ static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, if (features & flag) { features &= ~flag; btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, "clearing %llu ro feature flag", - flag); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); } spin_unlock(&fs_info->super_lock); } diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c new file mode 100644 index 000000000000..17f7c0d38768 --- /dev/null +++ b/fs/btrfs/delalloc-space.c @@ -0,0 +1,494 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "delalloc-space.h" +#include "block-rsv.h" +#include "btrfs_inode.h" +#include 
"space-info.h" +#include "transaction.h" +#include "qgroup.h" + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + u64 used; + int ret = 0; + int need_commit = 2; + int have_pinned_space; + + /* Make sure bytes are sectorsize aligned */ + bytes = ALIGN(bytes, fs_info->sectorsize); + + if (btrfs_is_free_space_inode(inode)) { + need_commit = 0; + ASSERT(current->journal_info); + } + +again: + /* Make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + used = btrfs_space_info_used(data_sinfo, true); + + if (used + bytes > data_sinfo->total_bytes) { + struct btrfs_trans_handle *trans; + + /* + * If we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_data_alloc_profile(fs_info); + /* + * It is ugly that we don't call nolock join + * transaction for the free space inode case here. + * But it is safe because we only do the data space + * reservation for the free space cache in the + * transaction context, the common join transaction + * just increase the counter of the current transaction + * handler, doesn't try to acquire the trans_lock of + * the fs. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_chunk_alloc(trans, alloc_target, + CHUNK_ALLOC_NO_FORCE); + btrfs_end_transaction(trans); + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else { + have_pinned_space = 1; + goto commit_trans; + } + } + + goto again; + } + + /* + * If we don't have enough pinned space to deal with this + * allocation, and no removed chunk in current transaction, + * don't bother committing the transaction. + */ + have_pinned_space = __percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes, + BTRFS_TOTAL_BYTES_PINNED_BATCH); + spin_unlock(&data_sinfo->lock); + + /* Commit the current transaction and try again */ +commit_trans: + if (need_commit) { + need_commit--; + + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, + (u64)-1); + } + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + if (have_pinned_space >= 0 || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || + need_commit > 0) { + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + /* + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. We don't need to + * explicitly run the delayed iputs here because + * the commit_transaction would have woken up + * the cleaner. 
+ */ + ret = btrfs_wait_on_delayed_iputs(fs_info); + if (ret) + return ret; + goto again; + } else { + btrfs_end_transaction(trans); + } + } + + trace_btrfs_space_reservation(fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return -ENOSPC; + } + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + data_sinfo->flags, bytes, 1); + spin_unlock(&data_sinfo->lock); + + return 0; +} + +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + int ret; + + /* align the range */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); + if (ret < 0) + return ret; + + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ + ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); + if (ret < 0) + btrfs_free_reserved_data_space_noquota(inode, start, len); + else + ret = 0; + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). + */ +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_space_info *data_sinfo; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, fs_info->sectorsize) - + round_down(start, fs_info->sectorsize); + start = round_down(start, fs_info->sectorsize); + + data_sinfo = fs_info->data_sinfo; + spin_lock(&data_sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); + trace_btrfs_space_reservation(fs_info, "space_info", + data_sinfo->flags, len, 0); + spin_unlock(&data_sinfo->lock); +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-inode data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->fs_info->sectorsize) - + round_down(start, root->fs_info->sectorsize); + start = round_down(start, root->fs_info->sectorsize); + + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, reserved, start, len); +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * @qgroup_free - free or convert qgroup meta. + * Unlike normal operation, qgroup meta reservation needs to know if we are + * freeing qgroup reservation or just converting it into per-trans. Normally + * @qgroup_free is true for error handling, and false for normal release. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. 
+ */ +static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + u64 qgroup_to_release = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, + &qgroup_to_release); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); + else + btrfs_qgroup_convert_reserved_meta(inode->root, + qgroup_to_release); +} + +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; + + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent + * + * This is overestimating in most cases. + */ + qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; + + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + block_rsv->qgroup_rsv_size = qgroup_rsv_size; + spin_unlock(&block_rsv->lock); +} + +static void calc_inode_reservations(struct btrfs_fs_info *fs_info, + u64 num_bytes, u64 *meta_reserve, + u64 *qgroup_reserve) +{ + u64 nr_extents = count_max_extents(num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + + /* We add one for the inode update at finish ordered time */ + *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, + nr_extents + csum_leaves + 1); + *qgroup_reserve = nr_extents * fs_info->nodesize; +} + +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 meta_reserve, qgroup_reserve; + unsigned nr_extents; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; + int ret = 0; + bool delalloc_lock = true; + + /* + * If we are a free space inode we need to not flush since we will be in + * the middle of a transaction commit. We also don't need the delalloc + * mutex since we won't race with anybody. We need this mostly to make + * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. + */ + if (btrfs_is_free_space_inode(inode)) { + flush = BTRFS_RESERVE_NO_FLUSH; + delalloc_lock = false; + } else { + if (current->journal_info) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + + if (btrfs_transaction_in_commit(fs_info)) + schedule_timeout(1); + } + + if (delalloc_lock) + mutex_lock(&inode->delalloc_mutex); + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + + /* + * We always want to do it this way, every other way is wrong and ends + * in tears. 
Pre-reserving the amount we are going to add will always + * be the right way, because otherwise if we have enough parallelism we + * could end up with thousands of inodes all holding little bits of + * reservations they were able to make previously and the only way to + * reclaim that space is to ENOSPC out the operations and clear + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ + calc_inode_reservations(fs_info, num_bytes, &meta_reserve, + &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + if (ret) + goto out_fail; + ret = btrfs_reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); + if (ret) + goto out_qgroup; + + /* + * Now we need to update our outstanding extents and csum bytes _first_ + * and then add the reservation to the block_rsv. This keeps us from + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. + */ + spin_lock(&inode->lock); + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + /* Now we can safely add our space to our block rsv */ + btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), meta_reserve, 1); + + spin_lock(&block_rsv->lock); + block_rsv->qgroup_rsv_reserved += qgroup_reserve; + spin_unlock(&block_rsv->lock); + + if (delalloc_lock) + mutex_unlock(&inode->delalloc_mutex); + return 0; +out_qgroup: + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); +out_fail: + btrfs_inode_rsv_release(inode, true); + if (delalloc_lock) + mutex_unlock(&inode->delalloc_mutex); + return ret; +} + +/** + * btrfs_delalloc_release_metadata - release a metadata reservation for an inode + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * + * This will release the metadata reservation for an inode. This can be called + * once we complete IO for a given set of bytes to release their metadata + * reservations, or on error for the same reason. + */ +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + spin_lock(&inode->lock); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * @qgroup_free: do we need to free qgroup meta reservation or convert them. + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. 
+ */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, qgroup_free); +} + +/** + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. + * + * This will do the following things + * + * - reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * - reserve space for metadata space, based on the number of outstanding + * extents and how much csums will be needed + * also reserve metadata space in a per root over-reserve method. + * - add to the inodes->delalloc_bytes + * - add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) + * + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) + */ +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, reserved, start, len); + if (ret < 0) + return ret; + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, *reserved, start, len); + return ret; +} + +/** + * btrfs_delalloc_release_space - release data and metadata space for delalloc + * @inode: inode we're releasing space for + * @start: start position of the space already reserved + * @len: the len of the space already reserved + * @release_bytes: the len of the space we consumed or didn't use + * + * This function will release the metadata space that was not used and will + * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes + * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. 
+ */ +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free) +{ + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); + btrfs_free_reserved_data_space(inode, reserved, start, len); +} diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h new file mode 100644 index 000000000000..54466fbd7075 --- /dev/null +++ b/fs/btrfs/delalloc-space.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DELALLOC_SPACE_H +#define BTRFS_DELALLOC_SPACE_H + +struct extent_changeset; + +int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +void btrfs_free_reserved_data_space(struct inode *inode, + struct extent_changeset *reserved, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, + struct extent_changeset *reserved, + u64 start, u64 len, bool qgroup_free); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); +void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, + bool qgroup_free); +int btrfs_delalloc_reserve_space(struct inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); + +#endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a73fc23e2961..9a91d1eb0af4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -10,6 +10,7 @@ #include "delayed-ref.h" #include "transaction.h" #include "qgroup.h" +#include "space-info.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -24,6 +25,179 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; * of hammering updates on the extent allocation tree. */ +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + bool ret = false; + u64 reserved; + + spin_lock(&global_rsv->lock); + reserved = global_rsv->reserved; + spin_unlock(&global_rsv->lock); + + /* + * Since the global reserve is just kind of magic we don't really want + * to rely on it to save our bacon, so if our size is more than the + * delayed_refs_rsv and the global rsv then it's time to think about + * bailing. + */ + spin_lock(&delayed_refs_rsv->lock); + reserved += delayed_refs_rsv->reserved; + if (delayed_refs_rsv->size >= reserved) + ret = true; + spin_unlock(&delayed_refs_rsv->lock); + return ret; +} + +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) +{ + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + u64 val; + + smp_mb(); + avg_runtime = trans->fs_info->avg_delayed_ref_runtime; + val = num_entries * avg_runtime; + if (val >= NSEC_PER_SEC) + return 1; + if (val >= NSEC_PER_SEC / 2) + return 2; + + return btrfs_check_space_for_delayed_refs(trans->fs_info); +} + +/** + * btrfs_delayed_refs_rsv_release - release a ref head's reservation. + * @fs_info - the fs_info for our fs. + * @nr - the number of items to drop. + * + * This drops the delayed ref head's count from the delayed refs rsv and frees + * any excess reservation we had. 
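The throttling decision added to delayed-ref.c above (btrfs_should_throttle_delayed_refs together with btrfs_check_space_for_delayed_refs) can be modeled in a few lines. This is a user-space sketch with plain integers standing in for the block reserve fields and with all locking omitted; only the arithmetic follows the hunk.

/*
 * Stand-alone model of the delayed-ref throttling decision shown above.
 * Return 1 for ~1s of pending ref work, 2 for ~0.5s, otherwise fall back
 * to the space check against the global + delayed refs reserves.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static bool check_space_for_delayed_refs(uint64_t global_reserved,
                                         uint64_t delayed_refs_reserved,
                                         uint64_t delayed_refs_size)
{
        /* Throttle once the rsv size outgrows what global + delayed refs hold. */
        return delayed_refs_size >= global_reserved + delayed_refs_reserved;
}

static int should_throttle_delayed_refs(uint64_t num_entries,
                                        uint64_t avg_runtime_ns,
                                        uint64_t global_reserved,
                                        uint64_t delayed_refs_reserved,
                                        uint64_t delayed_refs_size)
{
        uint64_t val = num_entries * avg_runtime_ns;

        if (val >= NSEC_PER_SEC)          /* roughly a second of pending ref work */
                return 1;
        if (val >= NSEC_PER_SEC / 2)      /* roughly half a second of pending work */
                return 2;
        return check_space_for_delayed_refs(global_reserved,
                                            delayed_refs_reserved,
                                            delayed_refs_size);
}

int main(void)
{
        /* 200k pending entries at ~3us each is ~0.6s of work: level 2 */
        printf("%d\n", should_throttle_delayed_refs(200000, 3000,
                                                    1 << 20, 1 << 20, 1 << 20));
        return 0;
}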
+ */ +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); + u64 released = 0; + + released = __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, + NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv + * @trans - the trans that may have generated delayed refs + * + * This is to be called anytime we may have adjusted trans->delayed_ref_updates, + * it'll calculate the additional size and add it to the delayed_refs_rsv. + */ +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + u64 num_bytes; + + if (!trans->delayed_ref_updates) + return; + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, + trans->delayed_ref_updates); + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; + delayed_rsv->full = 0; + spin_unlock(&delayed_rsv->lock); + trans->delayed_ref_updates = 0; +} + +/** + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. + * @fs_info - the fs info for our fs. + * @src - the source block rsv to transfer from. + * @num_bytes - the number of bytes to transfer. + * + * This transfers up to the num_bytes amount from the src rsv to the + * delayed_refs_rsv. Any extra bytes are returned to the space info. + */ +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + u64 to_free = 0; + + spin_lock(&src->lock); + src->reserved -= num_bytes; + src->size -= num_bytes; + spin_unlock(&src->lock); + + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { + u64 delta = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + if (num_bytes > delta) { + to_free = num_bytes - delta; + num_bytes = delta; + } + } else { + to_free = num_bytes; + num_bytes = 0; + } + + if (num_bytes) + delayed_refs_rsv->reserved += num_bytes; + if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) + delayed_refs_rsv->full = 1; + spin_unlock(&delayed_refs_rsv->lock); + + if (num_bytes) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + if (to_free) + btrfs_space_info_add_old_bytes(fs_info, + delayed_refs_rsv->space_info, to_free); +} + +/** + * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. + * @fs_info - the fs_info for our fs. + * @flush - control how we can flush for this reservation. + * + * This will refill the delayed block_rsv up to 1 items size worth of space and + * will return -ENOSPC if we can't make the reservation. 
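The transfer logic in btrfs_migrate_to_delayed_refs_rsv() above is easy to lose in the locking; the following user-space sketch keeps only its arithmetic. Bytes taken from the source reserve fill the delayed refs reserve up to its current deficit (size - reserved); anything beyond that is handed back as "to_free", which the real code returns to the space info.

/*
 * Stand-alone model of the byte transfer in btrfs_migrate_to_delayed_refs_rsv().
 * A tiny struct stands in for btrfs_block_rsv; locks and tracepoints are omitted.
 */
#include <stdint.h>
#include <stdio.h>

struct rsv {
        uint64_t size;      /* how much this reserve wants to hold */
        uint64_t reserved;  /* how much it actually holds */
};

static uint64_t migrate_to_delayed_refs_rsv(struct rsv *src, struct rsv *delayed,
                                            uint64_t num_bytes)
{
        uint64_t to_free = 0;

        src->reserved -= num_bytes;
        src->size -= num_bytes;

        if (delayed->size > delayed->reserved) {
                uint64_t delta = delayed->size - delayed->reserved;

                if (num_bytes > delta) {
                        to_free = num_bytes - delta; /* overshoot goes back */
                        num_bytes = delta;
                }
        } else {
                to_free = num_bytes;                 /* already full, keep nothing */
                num_bytes = 0;
        }

        delayed->reserved += num_bytes;
        return to_free;                              /* returned to the space info */
}

int main(void)
{
        struct rsv src = { .size = 1024, .reserved = 1024 };
        struct rsv delayed = { .size = 600, .reserved = 200 };

        /* deficit is 400, so moving 512 keeps 400 and frees 112 */
        printf("to_free=%llu delayed.reserved=%llu\n",
               (unsigned long long)migrate_to_delayed_refs_rsv(&src, &delayed, 512),
               (unsigned long long)delayed.reserved);
        return 0;
}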
+ */ +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + num_bytes = block_rsv->size - block_rsv->reserved; + num_bytes = min(num_bytes, limit); + } + spin_unlock(&block_rsv->lock); + + if (!num_bytes) + return 0; + + ret = btrfs_reserve_metadata_bytes(fs_info->extent_root, block_rsv, + num_bytes, flush); + if (ret) + return ret; + btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +} + /* * compare two delayed tree backrefs with same bytenr and type */ @@ -957,13 +1131,14 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, } /* - * this does a simple search for the head node for a given extent. - * It must be called with the delayed ref spinlock held, and it returns - * the head node if any where found, or NULL if not. + * This does a simple search for the head node for a given extent. Returns the + * head node if found, or NULL if not. */ struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr) { + lockdep_assert_held(&delayed_refs->lock); + return find_ref_head(delayed_refs, bytenr, false); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c18f93ea88ed..1c977e6d45dc 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -364,6 +364,16 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + /* * helper functions to cast a node into its container */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ee0989c7e3a9..6b2e9aa83ffa 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -201,7 +201,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return PTR_ERR(bdev); } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { @@ -237,7 +237,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } rcu_assign_pointer(device->name, name); - mutex_lock(&fs_info->fs_devices->device_list_mutex); set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; device->io_width = fs_info->sectorsize; @@ -256,6 +255,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; @@ -399,7 +400,6 @@ static 
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, int ret; struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - bool need_unlock; src_device = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name); @@ -413,11 +413,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return -ETXTBSY; } - ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, - src_device, &tgt_device); - if (ret) - return ret; - /* * Here we commit the transaction to make sure commit_total_bytes * of all the devices are updated. @@ -431,7 +426,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return PTR_ERR(trans); } - need_unlock = true; + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, + src_device, &tgt_device); + if (ret) + return ret; + down_write(&dev_replace->rwsem); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -442,11 +441,11 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: ASSERT(0); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + up_write(&dev_replace->rwsem); goto leave; } dev_replace->cont_reading_from_srcdev_mode = read_src; - WARN_ON(!src_device); dev_replace->srcdev = src_device; dev_replace->tgtdev = tgt_device; @@ -471,7 +470,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - need_unlock = false; ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) @@ -479,16 +477,16 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* force writing the updated state information to disk */ - trans = btrfs_start_transaction(root, 0); + /* Commit dev_replace state and reserve 1 item for it. 
*/ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - need_unlock = true; down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; + up_write(&dev_replace->rwsem); goto leave; } @@ -510,8 +508,6 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, return ret; leave: - if (need_unlock) - up_write(&dev_replace->rwsem); btrfs_destroy_dev_replace_tgtdev(tgt_device); return ret; } @@ -678,7 +674,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_device_set_disk_total_bytes(tgt_device, src_device->disk_total_bytes); btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); - tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; btrfs_assign_next_active_device(src_device, tgt_device); @@ -728,7 +723,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_device *srcdev, struct btrfs_device *tgtdev) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; u64 start = 0; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index deb74a8c191a..41a2bd2e0c56 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -19,6 +19,7 @@ #include <linux/crc32c.h> #include <linux/sched/mm.h> #include <asm/unaligned.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -40,10 +41,6 @@ #include "tree-checker.h" #include "ref-verify.h" -#ifdef CONFIG_X86 -#include <asm/cpufeature.h> -#endif - #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ BTRFS_SUPER_FLAG_ERROR |\ @@ -249,16 +246,6 @@ out: return em; } -u32 btrfs_csum_data(const char *data, u32 seed, size_t len) -{ - return crc32c(seed, data, len); -} - -void btrfs_csum_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - /* * Compute the csum of a btree block and store the result to provided buffer. 
* @@ -266,6 +253,8 @@ void btrfs_csum_final(u32 crc, u8 *result) */ static int csum_tree_block(struct extent_buffer *buf, u8 *result) { + struct btrfs_fs_info *fs_info = buf->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); unsigned long len; unsigned long cur_len; unsigned long offset = BTRFS_CSUM_SIZE; @@ -273,9 +262,12 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) unsigned long map_start; unsigned long map_len; int err; - u32 crc = ~(u32)0; + + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); len = buf->len - offset; + while (len > 0) { /* * Note: we don't need to check for the err == 1 case here, as @@ -288,14 +280,13 @@ static int csum_tree_block(struct extent_buffer *buf, u8 *result) if (WARN_ON(err)) return err; cur_len = min(len, map_len - (offset - map_start)); - crc = btrfs_csum_data(kaddr + offset - map_start, - crc, cur_len); + crypto_shash_update(shash, kaddr + offset - map_start, cur_len); len -= cur_len; offset += cur_len; } memset(result, 0, BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + crypto_shash_final(shash, result); return 0; } @@ -356,6 +347,16 @@ out: return ret; } +static bool btrfs_supported_super_csum(u16 csum_type) +{ + switch (csum_type) { + case BTRFS_CSUM_TYPE_CRC32: + return true; + default: + return false; + } +} + /* * Return 0 if the superblock checksum type matches the checksum value of that * algorithm. Pass the raw disk superblock data. @@ -365,33 +366,25 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, { struct btrfs_super_block *disk_sb = (struct btrfs_super_block *)raw_disk_sb; - u16 csum_type = btrfs_super_csum_type(disk_sb); - int ret = 0; + char result[BTRFS_CSUM_SIZE]; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - if (csum_type == BTRFS_CSUM_TYPE_CRC32) { - u32 crc = ~(u32)0; - char result[sizeof(crc)]; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); - /* - * The super_block structure does not span the whole - * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checksum. - */ - crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, - crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, result); + /* + * The super_block structure does not span the whole + * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is + * filled with zeros and is included in the checksum. 
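For reference, here is a user-space sketch of what the removed btrfs_csum_data()/btrfs_csum_final() pair computed: CRC-32C seeded with ~0, finalized by inverting and storing the 32-bit result little-endian. The crypto_shash "crc32c" path that replaces it in this series is expected to produce the same four bytes, though that equivalence is an assumption of this note, not something the hunk states; the bitwise loop below is only for illustration, since the kernel uses table- or instruction-based implementations.

/*
 * User-space model of the old btrfs checksum helpers' semantics.
 * CRC32C_POLY is the reflected Castagnoli polynomial.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CRC32C_POLY 0x82F63B78u

static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
        const uint8_t *p = data;

        while (len--) {
                crc ^= *p++;
                for (int bit = 0; bit < 8; bit++)
                        crc = (crc >> 1) ^ (CRC32C_POLY & -(crc & 1u));
        }
        return crc;
}

static void csum_final(uint32_t crc, uint8_t result[4])
{
        crc = ~crc;                      /* matches the removed btrfs_csum_final() */
        result[0] = crc & 0xff;          /* stored little-endian, as on disk */
        result[1] = (crc >> 8) & 0xff;
        result[2] = (crc >> 16) & 0xff;
        result[3] = (crc >> 24) & 0xff;
}

int main(void)
{
        const char *msg = "123456789";
        uint8_t csum[4];

        csum_final(crc32c_update(~0u, msg, strlen(msg)), csum);
        /* Standard CRC-32C check value for "123456789" is 0xE3069283 */
        printf("%02x%02x%02x%02x\n", csum[3], csum[2], csum[1], csum[0]);
        return 0;
}

Note that, as in the superblock hunk above, btrfs checksums the range starting after the csum field itself (BTRFS_CSUM_SIZE bytes in), so the stored digest never covers its own bytes.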
+ */ + crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, result); - if (memcmp(raw_disk_sb, result, sizeof(result))) - ret = 1; - } + if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb))) + return 1; - if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { - btrfs_err(fs_info, "unsupported checksum algorithm %u", - csum_type); - ret = 1; - } - - return ret; + return 0; } int btrfs_verify_level_key(struct extent_buffer *eb, int level, @@ -873,14 +866,13 @@ static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio, return btree_csum_one_bio(bio); } -static int check_async_write(struct btrfs_inode *bi) +static int check_async_write(struct btrfs_fs_info *fs_info, + struct btrfs_inode *bi) { if (atomic_read(&bi->sync_writers)) return 0; -#ifdef CONFIG_X86 - if (static_cpu_has(X86_FEATURE_XMM4_2)) + if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags)) return 0; -#endif return 1; } @@ -889,7 +881,7 @@ static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(BTRFS_I(inode)); + int async = check_async_write(fs_info, BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -2262,6 +2254,29 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, return 0; } +static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) +{ + struct crypto_shash *csum_shash; + const char *csum_name = btrfs_super_csum_name(csum_type); + + csum_shash = crypto_alloc_shash(csum_name, 0, 0); + + if (IS_ERR(csum_shash)) { + btrfs_err(fs_info, "error allocating %s hash for checksum", + csum_name); + return PTR_ERR(csum_shash); + } + + fs_info->csum_shash = csum_shash; + + return 0; +} + +static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) +{ + crypto_free_shash(fs_info->csum_shash); +} + static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { @@ -2577,7 +2592,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, ret = validate_super(fs_info, sb, -1); if (ret < 0) goto out; - if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) { + if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) { ret = -EUCLEAN; btrfs_err(fs_info, "invalid csum type, has %u want %u", btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32); @@ -2607,6 +2622,7 @@ int open_ctree(struct super_block *sb, u32 stripesize; u64 generation; u64 features; + u16 csum_type; struct btrfs_key location; struct buffer_head *bh; struct btrfs_super_block *disk_super; @@ -2689,7 +2705,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); - btrfs_mapping_init(&fs_info->mapping_tree); + extent_map_tree_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); @@ -2793,6 +2809,8 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; + fs_info->send_in_progress = 0; + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2813,6 +2831,25 @@ int open_ctree(struct super_block *sb, } /* + * Verify the type first, if that or the the checksum value are + * corrupted, we'll find out + */ + csum_type = 
btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + csum_type); + err = -EINVAL; + brelse(bh); + goto fail_alloc; + } + + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { + err = ret; + goto fail_alloc; + } + + /* * We want to check superblock checksum, the type is stored inside. * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ @@ -2820,7 +2857,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); - goto fail_alloc; + goto fail_csum; } /* @@ -2857,11 +2894,11 @@ int open_ctree(struct super_block *sb, if (ret) { btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } if (!btrfs_super_root(disk_super)) - goto fail_alloc; + goto fail_csum; /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) @@ -2883,7 +2920,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super) & @@ -2893,7 +2930,7 @@ int open_ctree(struct super_block *sb, "cannot mount because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } features = btrfs_super_incompat_flags(disk_super); @@ -2937,7 +2974,7 @@ int open_ctree(struct super_block *sb, btrfs_err(fs_info, "unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", nodesize, sectorsize); - goto fail_alloc; + goto fail_csum; } /* @@ -2953,7 +2990,7 @@ int open_ctree(struct super_block *sb, "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; - goto fail_alloc; + goto fail_csum; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3331,6 +3368,8 @@ fail_tree_roots: fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); +fail_csum: + btrfs_free_csum_hash(fs_info); fail_alloc: fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -3472,17 +3511,20 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int max_mirrors) { + struct btrfs_fs_info *fs_info = device->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct buffer_head *bh; int i; int ret; int errors = 0; - u32 crc; u64 bytenr; int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; + shash->tfm = fs_info->csum_shash; + for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= @@ -3491,10 +3533,10 @@ static int write_dev_supers(struct btrfs_device *device, btrfs_set_super_bytenr(sb, bytenr); - crc = ~(u32)0; - crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); + crypto_shash_init(shash); + crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + crypto_shash_final(shash, sb->csum); /* One reference for us, and we leave it for the caller */ bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, @@ -3709,7 +3751,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) - 
min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[BTRFS_RAID_SINGLE]. tolerated_failures); @@ -3718,7 +3760,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) continue; if (!(flags & btrfs_raid_array[raid_type].bg_flag)) continue; - min_tolerated = min(min_tolerated, + min_tolerated = min_t(int, min_tolerated, btrfs_raid_array[raid_type]. tolerated_failures); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index a0161aa1ea0b..e80f7c45a307 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -115,8 +115,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -u32 btrfs_csum_data(const char *data, u32 seed, size_t len); -void btrfs_csum_final(u32 crc, u8 *result); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5faf057f6f37..d3b58e388535 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -28,46 +28,12 @@ #include "sysfs.h" #include "qgroup.h" #include "ref-verify.h" +#include "space-info.h" +#include "block-rsv.h" +#include "delalloc-space.h" #undef SCRAMBLE_DELAYED_REFS -/* - * control flags for do_chunk_alloc's force field - * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk - * if we really need one. - * - * CHUNK_ALLOC_LIMITED means to only try and allocate one - * if we have very few chunks already allocated. This is - * used as part of the clustering code to help make sure - * we have a good pool of storage to cluster in, without - * filling the FS with empty chunks - * - * CHUNK_ALLOC_FORCE means it must try to allocate one - * - */ -enum { - CHUNK_ALLOC_NO_FORCE = 0, - CHUNK_ALLOC_LIMITED = 1, - CHUNK_ALLOC_FORCE = 2, -}; - -/* - * Declare a helper function to detect underflow of various space info members - */ -#define DECLARE_SPACE_INFO_UPDATE(name) \ -static inline void update_##name(struct btrfs_space_info *sinfo, \ - s64 bytes) \ -{ \ - if (bytes < 0 && sinfo->name < -bytes) { \ - WARN_ON(1); \ - sinfo->name = 0; \ - return; \ - } \ - sinfo->name += bytes; \ -} - -DECLARE_SPACE_INFO_UPDATE(bytes_may_use); -DECLARE_SPACE_INFO_UPDATE(bytes_pinned); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, @@ -84,21 +50,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups); -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes); -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -737,62 +690,39 @@ struct 
btrfs_block_group_cache *btrfs_lookup_block_group( return block_group_cache_tree_search(info, bytenr, 1); } -static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, - u64 flags) +static u64 generic_ref_to_space_flags(struct btrfs_ref *ref) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; - - flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); - return found; - } + if (ref->type == BTRFS_REF_METADATA) { + if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) + return BTRFS_BLOCK_GROUP_SYSTEM; + else + return BTRFS_BLOCK_GROUP_METADATA; } - rcu_read_unlock(); - return NULL; + return BTRFS_BLOCK_GROUP_DATA; } static void add_pinned_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_ref *ref, int sign) + struct btrfs_ref *ref) { struct btrfs_space_info *space_info; - s64 num_bytes; - u64 flags; - - ASSERT(sign == 1 || sign == -1); - num_bytes = sign * ref->len; - if (ref->type == BTRFS_REF_METADATA) { - if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID) - flags = BTRFS_BLOCK_GROUP_SYSTEM; - else - flags = BTRFS_BLOCK_GROUP_METADATA; - } else { - flags = BTRFS_BLOCK_GROUP_DATA; - } + u64 flags = generic_ref_to_space_flags(ref); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); - percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes, + percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len, BTRFS_TOTAL_BYTES_PINNED_BATCH); } -/* - * after adding space to the filesystem, we need to clear the full flags - * on all the space infos. - */ -void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +static void sub_pinned_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_ref *ref) { - struct list_head *head = &info->space_info; - struct btrfs_space_info *found; + struct btrfs_space_info *space_info; + u64 flags = generic_ref_to_space_flags(ref); - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) - found->full = 0; - rcu_read_unlock(); + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); + percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len, + BTRFS_TOTAL_BYTES_PINNED_BATCH); } /* simple helper to search for an existing data extent at a given offset */ @@ -1121,11 +1051,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -2065,7 +1995,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_ref_tree_mod(fs_info, generic_ref); if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) - add_pinned_bytes(fs_info, generic_ref, -1); + sub_pinned_bytes(fs_info, generic_ref); return ret; } @@ -2462,7 +2392,7 @@ void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, flags = BTRFS_BLOCK_GROUP_SYSTEM; else flags = BTRFS_BLOCK_GROUP_METADATA; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); 
percpu_counter_add_batch(&space_info->total_bytes_pinned, -head->num_bytes, @@ -2824,49 +2754,6 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes) return num_csums; } -bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - bool ret = false; - u64 reserved; - - spin_lock(&global_rsv->lock); - reserved = global_rsv->reserved; - spin_unlock(&global_rsv->lock); - - /* - * Since the global reserve is just kind of magic we don't really want - * to rely on it to save our bacon, so if our size is more than the - * delayed_refs_rsv and the global rsv then it's time to think about - * bailing. - */ - spin_lock(&delayed_refs_rsv->lock); - reserved += delayed_refs_rsv->reserved; - if (delayed_refs_rsv->size >= reserved) - ret = true; - spin_unlock(&delayed_refs_rsv->lock); - return ret; -} - -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) -{ - u64 num_entries = - atomic_read(&trans->transaction->delayed_refs.num_entries); - u64 avg_runtime; - u64 val; - - smp_mb(); - avg_runtime = trans->fs_info->avg_delayed_ref_runtime; - val = num_entries * avg_runtime; - if (val >= NSEC_PER_SEC) - return 1; - if (val >= NSEC_PER_SEC / 2) - return 2; - - return btrfs_check_space_for_delayed_refs(trans->fs_info); -} - /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be @@ -3834,93 +3721,6 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } -static const char *alloc_name(u64 flags) -{ - switch (flags) { - case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: - return "mixed"; - case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; - case BTRFS_BLOCK_GROUP_DATA: - return "data"; - case BTRFS_BLOCK_GROUP_SYSTEM: - return "system"; - default: - WARN_ON(1); - return "invalid-combination"; - }; -} - -static int create_space_info(struct btrfs_fs_info *info, u64 flags) -{ - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - - ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, - GFP_KERNEL); - if (ret) { - kfree(space_info); - return ret; - } - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - INIT_LIST_HEAD(&space_info->block_groups[i]); - init_rwsem(&space_info->groups_sem); - spin_lock_init(&space_info->lock); - space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; - space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; - init_waitqueue_head(&space_info->wait); - INIT_LIST_HEAD(&space_info->ro_bgs); - INIT_LIST_HEAD(&space_info->tickets); - INIT_LIST_HEAD(&space_info->priority_tickets); - - ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, - info->space_info_kobj, "%s", - alloc_name(space_info->flags)); - if (ret) { - kobject_put(&space_info->kobj); - return ret; - } - - list_add_rcu(&space_info->list, &info->space_info); - if (flags & BTRFS_BLOCK_GROUP_DATA) - info->data_sinfo = space_info; - - return ret; -} - -static void update_space_info(struct btrfs_fs_info *info, u64 flags, - u64 total_bytes, u64 bytes_used, - u64 bytes_readonly, - struct btrfs_space_info **space_info) -{ - struct btrfs_space_info *found; - int factor; - - factor = btrfs_bg_type_to_factor(flags); - - found = __find_space_info(info, flags); - 
ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += total_bytes; - found->disk_total += total_bytes * factor; - found->bytes_used += bytes_used; - found->disk_used += bytes_used * factor; - found->bytes_readonly += bytes_readonly; - if (total_bytes > 0) - found->full = 0; - space_info_add_new_bytes(info, found, total_bytes - - bytes_used - bytes_readonly); - spin_unlock(&found->lock); - *space_info = found; -} - static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) { u64 extra_flags = chunk_to_extended(flags) & @@ -4068,215 +3868,6 @@ u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info) return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); } -static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, - bool may_use_included) -{ - ASSERT(s_info); - return s_info->bytes_used + s_info->bytes_reserved + - s_info->bytes_pinned + s_info->bytes_readonly + - (may_use_included ? s_info->bytes_may_use : 0); -} - -int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; - - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, fs_info->sectorsize); - - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } - -again: - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * if we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = do_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. 
- */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - update_bytes_may_use(data_sinfo, bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, bytes, 1); - spin_unlock(&data_sinfo->lock); - - return 0; -} - -int btrfs_check_data_free_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; - - /* align the range */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len); - if (ret < 0) - return ret; - - /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ - ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); - if (ret < 0) - btrfs_free_reserved_data_space_noquota(inode, start, len); - else - ret = 0; - return ret; -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in a error case. - * - * This one will *NOT* use accurate qgroup reserved space API, just for case - * which we can't sleep and is sure it won't affect qgroup reserved space. - * Like clear_bit_hook(). - */ -void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, - u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_space_info *data_sinfo; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, fs_info->sectorsize) - - round_down(start, fs_info->sectorsize); - start = round_down(start, fs_info->sectorsize); - - data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - update_bytes_may_use(data_sinfo, -len); - trace_btrfs_space_reservation(fs_info, "space_info", - data_sinfo->flags, len, 0); - spin_unlock(&data_sinfo->lock); -} - -/* - * Called if we need to clear a data reservation for this inode - * Normally in a error case. - * - * This one will handle the per-inode data rsv map for accurate reserved - * space framework. 
- */ -void btrfs_free_reserved_data_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) -{ - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* Make sure the range is aligned to sectorsize */ - len = round_up(start + len, root->fs_info->sectorsize) - - round_down(start, root->fs_info->sectorsize); - start = round_down(start, root->fs_info->sectorsize); - - btrfs_free_reserved_data_space_noquota(inode, start, len); - btrfs_qgroup_free_data(inode, reserved, start, len); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4290,11 +3881,6 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { @@ -4325,15 +3911,9 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) { u64 num_dev; - if (type & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max; + if (!num_dev) num_dev = fs_info->fs_devices->rw_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_dev = 2; - else - num_dev = 1; /* DUP or single */ return num_dev; } @@ -4358,7 +3938,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) */ lockdep_assert_held(&fs_info->chunk_mutex); - info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); @@ -4372,7 +3952,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); - dump_space_info(fs_info, info, 0, 0); + btrfs_dump_space_info(fs_info, info, 0, 0); } if (left < thresh) { @@ -4405,8 +3985,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. */ -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, - int force) +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; @@ -4418,7 +3998,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (trans->allocating_chunk) return -ENOSPC; - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); ASSERT(space_info); do { @@ -4525,1714 +4105,6 @@ out: return ret; } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 profile; - u64 space_size; - u64 avail; - u64 used; - int factor; - - /* Don't overcommit when in mixed mode. 
*/ - if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) - return 0; - - if (system_chunk) - profile = btrfs_system_alloc_profile(fs_info); - else - profile = btrfs_metadata_alloc_profile(fs_info); - - used = btrfs_space_info_used(space_info, false); - - /* - * We only want to allow over committing if we have lots of actual space - * free, but if we don't have enough space to handle the global reserve - * space then we could end up having a real enospc problem when trying - * to allocate a chunk or some other such important allocation. - */ - spin_lock(&global_rsv->lock); - space_size = calc_global_rsv_need_space(global_rsv); - spin_unlock(&global_rsv->lock); - if (used + space_size >= space_info->total_bytes) - return 0; - - used += space_info->bytes_may_use; - - avail = atomic64_read(&fs_info->free_chunk_space); - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually usable. For raid56, the space info used - * doesn't include the parity drive, so we don't have to - * change the math - */ - factor = btrfs_bg_type_to_factor(profile); - avail = div_u64(avail, factor); - - /* - * If we aren't flushing all things, let us overcommit up to - * 1/2th of the space. If we can flush, don't let us overcommit - * too much, let it overcommit up to 1/8 of the space. - */ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; - else - avail >>= 1; - - if (used + bytes < space_info->total_bytes + avail) - return 1; - return 0; -} - -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). 
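The overcommit rule being removed here (and relocated to space-info.c by this series) is worth restating in isolation. The sketch below keeps only the arithmetic of can_overcommit(): leave twice the global reserve's size as headroom, divide the unallocated device space by the RAID redundancy factor, and then allow only 1/8 of it as overcommit when a full flush is possible, or 1/2 otherwise. The data/mixed-mode early return, the profile lookup and all locking are omitted, and plain integers stand in for the space_info and block_rsv structures.

/* Stand-alone model of the removed can_overcommit() decision. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum flush_mode { FLUSH_ALL, FLUSH_LIMITED };

static bool can_overcommit(uint64_t total_bytes, uint64_t used,
                           uint64_t bytes_may_use, uint64_t global_rsv_size,
                           uint64_t free_chunk_space, int raid_factor,
                           uint64_t bytes, enum flush_mode flush)
{
        uint64_t avail;

        /* keep enough headroom for the global reserve (twice its size) */
        if (used + 2 * global_rsv_size >= total_bytes)
                return false;

        used += bytes_may_use;
        avail = free_chunk_space / raid_factor;

        /* overcommit by at most 1/8 when we can flush, 1/2 otherwise */
        if (flush == FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;

        return used + bytes < total_bytes + avail;
}

int main(void)
{
        uint64_t G = 1ULL << 30;

        /* 8 GiB metadata, 5 GiB used + 1 GiB may_use, 1 GiB unallocated, RAID1 */
        printf("%d\n", can_overcommit(8 * G, 5 * G, G, 512ULL << 20,
                                      G, 2, 256ULL << 20, FLUSH_ALL));
        return 0;
}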
- */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - -static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - u64 bytes; - u64 nr; - - bytes = btrfs_calc_trans_metadata_size(fs_info, 1); - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - -#define EXTENT_SIZE_PER_ITEM SZ_256K - -/* - * shrink metadata reservation for delalloc - */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) -{ - struct btrfs_space_info *space_info; - struct btrfs_trans_handle *trans; - u64 delalloc_bytes; - u64 dio_bytes; - u64 async_pages; - u64 items; - long time_left; - unsigned long nr_pages; - int loops; - - /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; - - trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - if (delalloc_bytes == 0 && dio_bytes == 0) { - if (trans) - return; - if (wait_ordered) - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - return; - } - - /* - * If we are doing more ordered than delalloc we need to just wait on - * ordered extents, otherwise we'll waste time trying to flush delalloc - * that likely won't give us the space back we need. - */ - if (dio_bytes > delalloc_bytes) - wait_ordered = true; - - loops = 0; - while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. 
- */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); - - loops++; - if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); - } else { - time_left = schedule_timeout_killable(1); - if (time_left) - break; - } - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); - } -} - -struct reserve_ticket { - u64 orig_bytes; - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - -/** - * maybe_commit_transaction - possibly commit the transaction if its ok to - * @root - the root we're allocating for - * @bytes - the number of bytes we want to reserve - * @force - force the commit - * - * This will check to make sure that committing the transaction will actually - * get us somewhere and then commit the transaction if it does. Otherwise it - * will return -ENOSPC. - */ -static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) -{ - struct reserve_ticket *ticket = NULL; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_trans_handle *trans; - u64 bytes_needed; - u64 reclaim_bytes = 0; - - trans = (struct btrfs_trans_handle *)current->journal_info; - if (trans) - return -EAGAIN; - - spin_lock(&space_info->lock); - if (!list_empty(&space_info->priority_tickets)) - ticket = list_first_entry(&space_info->priority_tickets, - struct reserve_ticket, list); - else if (!list_empty(&space_info->tickets)) - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; - spin_unlock(&space_info->lock); - - if (!bytes_needed) - return 0; - - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - /* - * See if there is enough pinned space to make this reservation, or if - * we have block groups that are going to be freed, allowing us to - * possibly do a chunk allocation the next loop through. - */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || - __percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) - goto commit; - - /* - * See if there is some space in the delayed insertion reservation for - * this reservation. - */ - if (space_info != delayed_rsv->space_info) - goto enospc; - - spin_lock(&delayed_rsv->lock); - reclaim_bytes += delayed_rsv->reserved; - spin_unlock(&delayed_rsv->lock); - - spin_lock(&delayed_refs_rsv->lock); - reclaim_bytes += delayed_refs_rsv->reserved; - spin_unlock(&delayed_refs_rsv->lock); - if (reclaim_bytes >= bytes_needed) - goto commit; - bytes_needed -= reclaim_bytes; - - if (__percpu_counter_compare(&space_info->total_bytes_pinned, - bytes_needed, - BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) - goto enospc; - -commit: - return btrfs_commit_transaction(trans); -enospc: - btrfs_end_transaction(trans); - return -ENOSPC; -} - -/* - * Try to flush some data based on policy set by @state. This is only advisory - * and may fail for various reasons. 
The caller is supposed to examine the - * state of @space_info to detect the outcome. - */ -static void flush_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 num_bytes, - int state) -{ - struct btrfs_root *root = fs_info->extent_root; - struct btrfs_trans_handle *trans; - int nr; - int ret = 0; - - switch (state) { - case FLUSH_DELAYED_ITEMS_NR: - case FLUSH_DELAYED_ITEMS: - if (state == FLUSH_DELAYED_ITEMS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; - else - nr = -1; - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = btrfs_run_delayed_items_nr(trans, nr); - btrfs_end_transaction(trans); - break; - case FLUSH_DELALLOC: - case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, - state == FLUSH_DELALLOC_WAIT); - break; - case FLUSH_DELAYED_REFS_NR: - case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_reclaim_items_nr(fs_info, num_bytes); - else - nr = 0; - btrfs_run_delayed_refs(trans, nr); - btrfs_end_transaction(trans); - break; - case ALLOC_CHUNK: - case ALLOC_CHUNK_FORCE: - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - ret = do_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), - (state == ALLOC_CHUNK) ? - CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); - btrfs_end_transaction(trans); - if (ret > 0 || ret == -ENOSPC) - ret = 0; - break; - case COMMIT_TRANS: - /* - * If we have pending delayed iputs then we could free up a - * bunch of pinned space, so make sure we run the iputs before - * we do our pinned bytes check below. - */ - btrfs_run_delayed_iputs(fs_info); - btrfs_wait_on_delayed_iputs(fs_info); - - ret = may_commit_transaction(fs_info, space_info); - break; - default: - ret = -ENOSPC; - break; - } - - trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, - ret); - return; -} - -static inline u64 -btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) -{ - struct reserve_ticket *ticket; - u64 used; - u64 expected; - u64 to_reclaim = 0; - - list_for_each_entry(ticket, &space_info->tickets, list) - to_reclaim += ticket->bytes; - list_for_each_entry(ticket, &space_info->priority_tickets, list) - to_reclaim += ticket->bytes; - if (to_reclaim) - return to_reclaim; - - to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - return 0; - - used = btrfs_space_info_used(space_info, true); - - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) - expected = div_factor_fine(space_info->total_bytes, 95); - else - expected = div_factor_fine(space_info->total_bytes, 90); - - if (used > expected) - to_reclaim = used - expected; - else - to_reclaim = 0; - to_reclaim = min(to_reclaim, space_info->bytes_may_use + - space_info->bytes_reserved); - return to_reclaim; -} - -static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 used, bool system_chunk) -{ - u64 thresh = div_factor_fine(space_info->total_bytes, 98); - - /* If we're just plain full then async reclaim just slows us down. 
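The reclaim-target calculation removed just above (btrfs_calc_reclaim_metadata_size, also headed for space-info.c) can be reduced to a short model. In this sketch the waiting tickets are collapsed to a single byte count, the two can_overcommit() probes are folded into one flag that merely selects the 95%/90% threshold (the real code also returns 0 early when a small reservation could still overcommit), and div_factor_fine(x, n) is written out as x * n / 100.

/* Stand-alone, simplified model of the metadata reclaim target. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

static uint64_t calc_reclaim_metadata_size(uint64_t ticket_bytes,
                                           bool small_overcommit_ok,
                                           uint64_t total_bytes, uint64_t used,
                                           uint64_t may_use_plus_reserved)
{
        uint64_t expected, to_reclaim;

        /* Waiting reservations define the target directly. */
        if (ticket_bytes)
                return ticket_bytes;

        /* Aim to get usage back under 95% (or 90% when already tight). */
        expected = total_bytes * (small_overcommit_ok ? 95 : 90) / 100;
        to_reclaim = used > expected ? used - expected : 0;

        /* Never ask for more than is actually reclaimable. */
        return min_u64(to_reclaim, may_use_plus_reserved);
}

int main(void)
{
        uint64_t G = 1ULL << 30;

        /* no tickets, 8 GiB total, 200 MiB short of full, 1 GiB reclaimable */
        printf("%llu\n", (unsigned long long)
               calc_reclaim_metadata_size(0, true, 8 * G,
                                          8 * G - (200ULL << 20), G));
        return 0;
}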
*/ - if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) - return 0; - - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) - return 0; - - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); -} - -static bool wake_all_tickets(struct list_head *head) -{ - struct reserve_ticket *ticket; - - while (!list_empty(head)) { - ticket = list_first_entry(head, struct reserve_ticket, list); - list_del_init(&ticket->list); - ticket->error = -ENOSPC; - wake_up(&ticket->wait); - if (ticket->bytes != ticket->orig_bytes) - return true; - } - return false; -} - -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) -{ - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; - u64 to_reclaim; - int flush_state; - int commit_cycles = 0; - u64 last_tickets_id; - - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - last_tickets_id = space_info->tickets_id; - spin_unlock(&space_info->lock); - - flush_state = FLUSH_DELAYED_ITEMS_NR; - do { - flush_space(fs_info, space_info, to_reclaim, flush_state); - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets)) { - space_info->flush = 0; - spin_unlock(&space_info->lock); - return; - } - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, - false); - if (last_tickets_id == space_info->tickets_id) { - flush_state++; - } else { - last_tickets_id = space_info->tickets_id; - flush_state = FLUSH_DELAYED_ITEMS_NR; - if (commit_cycles) - commit_cycles--; - } - - /* - * We don't want to force a chunk allocation until we've tried - * pretty hard to reclaim space. Think of the case where we - * freed up a bunch of space and so have a lot of pinned space - * to reclaim. We would rather use that than possibly create a - * underutilized metadata chunk. So if this is our first run - * through the flushing state machine skip ALLOC_CHUNK_FORCE and - * commit the transaction. If nothing has changed the next go - * around then we can force a chunk allocation. 
- */ - if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) - flush_state++; - - if (flush_state > COMMIT_TRANS) { - commit_cycles++; - if (commit_cycles > 2) { - if (wake_all_tickets(&space_info->tickets)) { - flush_state = FLUSH_DELAYED_ITEMS_NR; - commit_cycles--; - } else { - space_info->flush = 0; - } - } else { - flush_state = FLUSH_DELAYED_ITEMS_NR; - } - } - spin_unlock(&space_info->lock); - } while (flush_state <= COMMIT_TRANS); -} - -void btrfs_init_async_reclaim_work(struct work_struct *work) -{ - INIT_WORK(work, btrfs_async_reclaim_metadata_space); -} - -static const enum btrfs_flush_state priority_flush_states[] = { - FLUSH_DELAYED_ITEMS_NR, - FLUSH_DELAYED_ITEMS, - ALLOC_CHUNK, -}; - -static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) -{ - u64 to_reclaim; - int flush_state; - - spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); - if (!to_reclaim) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - - flush_state = 0; - do { - flush_space(fs_info, space_info, to_reclaim, - priority_flush_states[flush_state]); - flush_state++; - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - } while (flush_state < ARRAY_SIZE(priority_flush_states)); -} - -static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - struct reserve_ticket *ticket) - -{ - DEFINE_WAIT(wait); - u64 reclaim_bytes = 0; - int ret = 0; - - spin_lock(&space_info->lock); - while (ticket->bytes > 0 && ticket->error == 0) { - ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); - if (ret) { - ret = -EINTR; - break; - } - spin_unlock(&space_info->lock); - - schedule(); - - finish_wait(&ticket->wait, &wait); - spin_lock(&space_info->lock); - } - if (!ret) - ret = ticket->error; - if (!list_empty(&ticket->list)) - list_del_init(&ticket->list); - if (ticket->bytes && ticket->bytes < ticket->orig_bytes) - reclaim_bytes = ticket->orig_bytes - ticket->bytes; - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @space_info - the space info we want to allocate from - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. 
- */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) -{ - struct reserve_ticket ticket; - u64 used; - u64 reclaim_bytes = 0; - int ret = 0; - - ASSERT(orig_bytes); - ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); - - spin_lock(&space_info->lock); - ret = -ENOSPC; - used = btrfs_space_info_used(space_info, true); - - /* - * If we have enough space then hooray, make our reservation and carry - * on. If not see if we can overcommit, and if we can, hooray carry on. - * If not things get more complicated. - */ - if (used + orig_bytes <= space_info->total_bytes) { - update_bytes_may_use(space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } else if (can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk)) { - update_bytes_may_use(space_info, orig_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, orig_bytes, 1); - ret = 0; - } - - /* - * If we couldn't make a reservation then setup our reservation ticket - * and kick the async worker if it's not already running. - * - * If we are a priority flusher then we just need to add our ticket to - * the list and we will do our own flushing further down. - */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - ticket.orig_bytes = orig_bytes; - ticket.bytes = orig_bytes; - ticket.error = 0; - init_waitqueue_head(&ticket.wait); - if (flush == BTRFS_RESERVE_FLUSH_ALL) { - list_add_tail(&ticket.list, &space_info->tickets); - if (!space_info->flush) { - space_info->flush = 1; - trace_btrfs_trigger_flush(fs_info, - space_info->flags, - orig_bytes, flush, - "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } else { - list_add_tail(&ticket.list, - &space_info->priority_tickets); - } - } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - used += orig_bytes; - /* - * We will do the space reservation dance during log replay, - * which means we won't have fs_info->fs_root set, so don't do - * the async reclaim as we will panic. 
- */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && - !work_busy(&fs_info->async_reclaim_work)) { - trace_btrfs_trigger_flush(fs_info, space_info->flags, - orig_bytes, flush, "preempt"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); - } - } - spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - return ret; - - if (flush == BTRFS_RESERVE_FLUSH_ALL) - return wait_reserve_ticket(fs_info, space_info, &ticket); - - ret = 0; - priority_reclaim_metadata_space(fs_info, space_info, &ticket); - spin_lock(&space_info->lock); - if (ticket.bytes) { - if (ticket.bytes < orig_bytes) - reclaim_bytes = orig_bytes - ticket.bytes; - list_del_init(&ticket.list); - ret = -ENOSPC; - } - spin_unlock(&space_info->lock); - - if (reclaim_bytes) - space_info_add_old_bytes(fs_info, space_info, reclaim_bytes); - ASSERT(list_empty(&ticket.list)); - return ret; -} - -/** - * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space - * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for - * @orig_bytes - the number of bytes we want - * @flush - whether or not we can flush to make our reservation - * - * This will reserve orig_bytes number of bytes from the space info associated - * with the block_rsv. If there is not enough space it will make an attempt to - * flush out space to make room. It will do this by flushing delalloc if - * possible or committing the transaction. If flush is 0 then no attempts to - * regain reservations will be made and this will fail if there is not enough - * space already. - */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool system_chunk = (root == fs_info->chunk_root); - - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); - if (ret == -ENOSPC && - unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { - if (block_rsv != global_rsv && - !block_rsv_use_bytes(global_rsv, orig_bytes)) - ret = 0; - } - if (ret == -ENOSPC) { - trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); - - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); - } - return ret; -} - -static struct btrfs_block_rsv *get_block_rsv( - const struct btrfs_trans_handle *trans, - const struct btrfs_root *root) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = NULL; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - (root == fs_info->csum_root && trans->adding_csums) || - (root == fs_info->uuid_root)) - block_rsv = trans->block_rsv; - - if (!block_rsv) - block_rsv = root->block_rsv; - - if (!block_rsv) - block_rsv = &fs_info->empty_block_rsv; - - return block_rsv; -} - -static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - int ret = -ENOSPC; - spin_lock(&block_rsv->lock); - if (block_rsv->reserved >= num_bytes) { - block_rsv->reserved -= num_bytes; - if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; - ret = 0; - } - spin_unlock(&block_rsv->lock); - return ret; -} - -static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, - u64 
num_bytes, bool update_size) -{ - spin_lock(&block_rsv->lock); - block_rsv->reserved += num_bytes; - if (update_size) - block_rsv->size += num_bytes; - else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; - spin_unlock(&block_rsv->lock); -} - -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; - spin_unlock(&global_rsv->lock); - - block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - -/** - * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv. - * @fs_info - the fs info for our fs. - * @src - the source block rsv to transfer from. - * @num_bytes - the number of bytes to transfer. - * - * This transfers up to the num_bytes amount from the src rsv to the - * delayed_refs_rsv. Any extra bytes are returned to the space info. - */ -void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, - u64 num_bytes) -{ - struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; - u64 to_free = 0; - - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - - spin_lock(&delayed_refs_rsv->lock); - if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { - u64 delta = delayed_refs_rsv->size - - delayed_refs_rsv->reserved; - if (num_bytes > delta) { - to_free = num_bytes - delta; - num_bytes = delta; - } - } else { - to_free = num_bytes; - num_bytes = 0; - } - - if (num_bytes) - delayed_refs_rsv->reserved += num_bytes; - if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; - spin_unlock(&delayed_refs_rsv->lock); - - if (num_bytes) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - if (to_free) - space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info, - to_free); -} - -/** - * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage. - * @fs_info - the fs_info for our fs. - * @flush - control how we can flush for this reservation. - * - * This will refill the delayed block_rsv up to 1 items size worth of space and - * will return -ENOSPC if we can't make the reservation. 
- */ -int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, - enum btrfs_reserve_flush_enum flush) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1); - u64 num_bytes = 0; - int ret = -ENOSPC; - - spin_lock(&block_rsv->lock); - if (block_rsv->reserved < block_rsv->size) { - num_bytes = block_rsv->size - block_rsv->reserved; - num_bytes = min(num_bytes, limit); - } - spin_unlock(&block_rsv->lock); - - if (!num_bytes) - return 0; - - ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv, - num_bytes, flush); - if (ret) - return ret; - block_rsv_add_bytes(block_rsv, num_bytes, 0); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); - return 0; -} - -/* - * This is for space we already have accounted in space_info->bytes_may_use, so - * basically when we're returning space from block_rsv's. - */ -static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head; - u64 used; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; - bool check_overcommit = false; - - spin_lock(&space_info->lock); - head = &space_info->priority_tickets; - - /* - * If we are over our limit then we need to check and see if we can - * overcommit, and if we can't then we just need to free up our space - * and not satisfy any requests. - */ - used = btrfs_space_info_used(space_info, true); - if (used - num_bytes >= space_info->total_bytes) - check_overcommit = true; -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - /* - * We use 0 bytes because this space is already reserved, so - * adding the ticket space would be a double count. - */ - if (check_overcommit && - !can_overcommit(fs_info, space_info, 0, flush, false)) - break; - if (num_bytes >= ticket->bytes) { - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - flush = BTRFS_RESERVE_FLUSH_ALL; - goto again; - } - update_bytes_may_use(space_info, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); -} - -/* - * This is for newly allocated space that isn't accounted in - * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent - * we use this helper. 
- */ -static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 num_bytes) -{ - struct reserve_ticket *ticket; - struct list_head *head = &space_info->priority_tickets; - -again: - while (!list_empty(head) && num_bytes) { - ticket = list_first_entry(head, struct reserve_ticket, - list); - if (num_bytes >= ticket->bytes) { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - ticket->bytes, 1); - list_del_init(&ticket->list); - num_bytes -= ticket->bytes; - update_bytes_may_use(space_info, ticket->bytes); - ticket->bytes = 0; - space_info->tickets_id++; - wake_up(&ticket->wait); - } else { - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, - num_bytes, 1); - update_bytes_may_use(space_info, num_bytes); - ticket->bytes -= num_bytes; - num_bytes = 0; - } - } - - if (num_bytes && head == &space_info->priority_tickets) { - head = &space_info->tickets; - goto again; - } -} - -static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - struct btrfs_block_rsv *dest, u64 num_bytes, - u64 *qgroup_to_release_ret) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 qgroup_to_release = 0; - u64 ret; - - spin_lock(&block_rsv->lock); - if (num_bytes == (u64)-1) { - num_bytes = block_rsv->size; - qgroup_to_release = block_rsv->qgroup_rsv_size; - } - block_rsv->size -= num_bytes; - if (block_rsv->reserved >= block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; - } else { - num_bytes = 0; - } - if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) { - qgroup_to_release = block_rsv->qgroup_rsv_reserved - - block_rsv->qgroup_rsv_size; - block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size; - } else { - qgroup_to_release = 0; - } - spin_unlock(&block_rsv->lock); - - ret = num_bytes; - if (num_bytes > 0) { - if (dest) { - spin_lock(&dest->lock); - if (!dest->full) { - u64 bytes_to_add; - - bytes_to_add = dest->size - dest->reserved; - bytes_to_add = min(num_bytes, bytes_to_add); - dest->reserved += bytes_to_add; - if (dest->reserved >= dest->size) - dest->full = 1; - num_bytes -= bytes_to_add; - } - spin_unlock(&dest->lock); - } - if (num_bytes) - space_info_add_old_bytes(fs_info, space_info, - num_bytes); - } - if (qgroup_to_release_ret) - *qgroup_to_release_ret = qgroup_to_release; - return ret; -} - -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes, - bool update_size) -{ - int ret; - - ret = block_rsv_use_bytes(src, num_bytes); - if (ret) - return ret; - - block_rsv_add_bytes(dst, num_bytes, update_size); - return 0; -} - -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) -{ - memset(rsv, 0, sizeof(*rsv)); - spin_lock_init(&rsv->lock); - rsv->type = type; -} - -void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv, - unsigned short type) -{ - btrfs_init_block_rsv(rsv, type); - rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); -} - -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) -{ - struct btrfs_block_rsv *block_rsv; - - block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); - if (!block_rsv) - return NULL; - - btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); - return block_rsv; -} - -void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, 
- struct btrfs_block_rsv *rsv) -{ - if (!rsv) - return; - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); - kfree(rsv); -} - -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 num_bytes, - enum btrfs_reserve_flush_enum flush) -{ - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) - block_rsv_add_bytes(block_rsv, num_bytes, true); - - return ret; -} - -int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = div_factor(block_rsv->size, min_factor); - if (block_rsv->reserved >= num_bytes) - ret = 0; - spin_unlock(&block_rsv->lock); - - return ret; -} - -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, u64 min_reserved, - enum btrfs_reserve_flush_enum flush) -{ - u64 num_bytes = 0; - int ret = -ENOSPC; - - if (!block_rsv) - return 0; - - spin_lock(&block_rsv->lock); - num_bytes = min_reserved; - if (block_rsv->reserved >= num_bytes) - ret = 0; - else - num_bytes -= block_rsv->reserved; - spin_unlock(&block_rsv->lock); - - if (!ret) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, false); - return 0; - } - - return ret; -} - -static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, u64 *qgroup_to_release) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *target = delayed_rsv; - - if (target->full || target == block_rsv) - target = global_rsv; - - if (block_rsv->space_info != target->space_info) - target = NULL; - - return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes, - qgroup_to_release); -} - -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); -} - -/** - * btrfs_inode_rsv_release - release any excessive reservation. - * @inode - the inode we need to release from. - * @qgroup_free - free or convert qgroup meta. - * Unlike normal operation, qgroup meta reservation needs to know if we are - * freeing qgroup reservation or just converting it into per-trans. Normally - * @qgroup_free is true for error handling, and false for normal release. - * - * This is the same as btrfs_block_rsv_release, except that it handles the - * tracepoint for the reservation. - */ -static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 released = 0; - u64 qgroup_to_release = 0; - - /* - * Since we statically set the block_rsv->size we just want to say we - * are releasing 0 bytes, and then we'll just get the reservation over - * the size free'd. 
- */ - released = __btrfs_block_rsv_release(fs_info, block_rsv, 0, - &qgroup_to_release); - if (released > 0) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), released, 0); - if (qgroup_free) - btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release); - else - btrfs_qgroup_convert_reserved_meta(inode->root, - qgroup_to_release); -} - -/** - * btrfs_delayed_refs_rsv_release - release a ref head's reservation. - * @fs_info - the fs_info for our fs. - * @nr - the number of items to drop. - * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. - */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr); - u64 released = 0; - - released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, - num_bytes, NULL); - if (released) - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, released, 0); -} - -static void update_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; - struct btrfs_space_info *sinfo = block_rsv->space_info; - u64 num_bytes; - - /* - * The global block rsv is based on the size of the extent tree, the - * checksum tree and the root tree. If the fs is empty we want to set - * it to a minimal amount for safety. - */ - num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + - btrfs_root_used(&fs_info->csum_root->root_item) + - btrfs_root_used(&fs_info->tree_root->root_item); - num_bytes = max_t(u64, num_bytes, SZ_16M); - - spin_lock(&sinfo->lock); - spin_lock(&block_rsv->lock); - - block_rsv->size = min_t(u64, num_bytes, SZ_512M); - - if (block_rsv->reserved < block_rsv->size) { - num_bytes = btrfs_space_info_used(sinfo, true); - if (sinfo->total_bytes > num_bytes) { - num_bytes = sinfo->total_bytes - num_bytes; - num_bytes = min(num_bytes, - block_rsv->size - block_rsv->reserved); - block_rsv->reserved += num_bytes; - update_bytes_may_use(sinfo, num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, - 1); - } - } else if (block_rsv->reserved > block_rsv->size) { - num_bytes = block_rsv->reserved - block_rsv->size; - update_bytes_may_use(sinfo, -num_bytes); - trace_btrfs_space_reservation(fs_info, "space_info", - sinfo->flags, num_bytes, 0); - block_rsv->reserved = block_rsv->size; - } - - if (block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; - - spin_unlock(&block_rsv->lock); - spin_unlock(&sinfo->lock); -} - -static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); - fs_info->chunk_block_rsv.space_info = space_info; - - space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - fs_info->global_block_rsv.space_info = space_info; - fs_info->trans_block_rsv.space_info = space_info; - fs_info->empty_block_rsv.space_info = space_info; - fs_info->delayed_block_rsv.space_info = space_info; - fs_info->delayed_refs_rsv.space_info = space_info; - - fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; - fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; - fs_info->tree_root->block_rsv = 
&fs_info->global_block_rsv; - if (fs_info->quota_root) - fs_info->quota_root->block_rsv = &fs_info->global_block_rsv; - fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; - - update_global_block_rsv(fs_info); -} - -static void release_global_block_rsv(struct btrfs_fs_info *fs_info) -{ - block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, - (u64)-1, NULL); - WARN_ON(fs_info->trans_block_rsv.size > 0); - WARN_ON(fs_info->trans_block_rsv.reserved > 0); - WARN_ON(fs_info->chunk_block_rsv.size > 0); - WARN_ON(fs_info->chunk_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_block_rsv.size > 0); - WARN_ON(fs_info->delayed_block_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.reserved > 0); - WARN_ON(fs_info->delayed_refs_rsv.size > 0); -} - -/* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs - * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. - */ -void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; - u64 num_bytes; - - if (!trans->delayed_ref_updates) - return; - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, - trans->delayed_ref_updates); - spin_lock(&delayed_rsv->lock); - delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; - spin_unlock(&delayed_rsv->lock); - trans->delayed_ref_updates = 0; -} - -/* - * To be called after all the new block groups attached to the transaction - * handle have been created (btrfs_create_pending_block_groups()). - */ -void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - if (!trans->chunk_bytes_reserved) - return; - - WARN_ON_ONCE(!list_empty(&trans->new_bgs)); - - block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, - trans->chunk_bytes_reserved, NULL); - trans->chunk_bytes_reserved = 0; -} - -/* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation - * root: the root of the parent directory - * rsv: block reservation - * items: the number of items that we need do reservation - * use_global_rsv: allow fallback to the global block reservation - * - * This function is used to reserve the space for snapshot/subvolume - * creation and deletion. Those operations are different with the - * common file/directory operations, they change two fs/file trees - * and root tree, the number of items that the qgroup reserves is - * different with the free space reservation. So we can not use - * the space reservation mechanism in start_transaction(). 
- */ -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, int items, - bool use_global_rsv) -{ - u64 qgroup_num_bytes = 0; - u64 num_bytes; - int ret; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { - /* One for parent inode, two for dir entries */ - qgroup_num_bytes = 3 * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); - if (ret) - return ret; - } - - num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); - rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); - ret = btrfs_block_rsv_add(root, rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); - - if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); - - if (ret && qgroup_num_bytes) - btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); - - return ret; -} - -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *rsv) -{ - btrfs_block_rsv_release(fs_info, rsv, (u64)-1); -} - -static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, - struct btrfs_inode *inode) -{ - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 reserve_size = 0; - u64 qgroup_rsv_size = 0; - u64 csum_leaves; - unsigned outstanding_extents; - - lockdep_assert_held(&inode->lock); - outstanding_extents = inode->outstanding_extents; - if (outstanding_extents) - reserve_size = btrfs_calc_trans_metadata_size(fs_info, - outstanding_extents + 1); - csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, - inode->csum_bytes); - reserve_size += btrfs_calc_trans_metadata_size(fs_info, - csum_leaves); - /* - * For qgroup rsv, the calculation is very simple: - * account one nodesize for each outstanding extent - * - * This is overestimating in most cases. - */ - qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize; - - spin_lock(&block_rsv->lock); - block_rsv->size = reserve_size; - block_rsv->qgroup_rsv_size = qgroup_rsv_size; - spin_unlock(&block_rsv->lock); -} - -static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) -{ - u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); - - /* We add one for the inode update at finish ordered time */ - *meta_reserve = btrfs_calc_trans_metadata_size(fs_info, - nr_extents + csum_leaves + 1); - *qgroup_reserve = nr_extents * fs_info->nodesize; -} - -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv = &inode->block_rsv; - u64 meta_reserve, qgroup_reserve; - unsigned nr_extents; - enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; - int ret = 0; - bool delalloc_lock = true; - - /* If we are a free space inode we need to not flush since we will be in - * the middle of a transaction commit. We also don't need the delalloc - * mutex since we won't race with anybody. We need this mostly to make - * lockdep shut its filthy mouth. - * - * If we have a transaction open (can happen if we call truncate_block - * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
- */ - if (btrfs_is_free_space_inode(inode)) { - flush = BTRFS_RESERVE_NO_FLUSH; - delalloc_lock = false; - } else { - if (current->journal_info) - flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - } - - if (delalloc_lock) - mutex_lock(&inode->delalloc_mutex); - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - - /* - * We always want to do it this way, every other way is wrong and ends - * in tears. Pre-reserving the amount we are going to add will always - * be the right way, because otherwise if we have enough parallelism we - * could end up with thousands of inodes all holding little bits of - * reservations they were able to make previously and the only way to - * reclaim that space is to ENOSPC out the operations and clear - * everything out and try again, which is bad. This way we just - * over-reserve slightly, and clean up the mess when we are done. - */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); - if (ret) - goto out_fail; - ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush); - if (ret) - goto out_qgroup; - - /* - * Now we need to update our outstanding extents and csum bytes _first_ - * and then add the reservation to the block_rsv. This keeps us from - * racing with an ordered completion or some such that would think it - * needs to free the reservation we just made. - */ - spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - /* Now we can safely add our space to our block rsv */ - block_rsv_add_bytes(block_rsv, meta_reserve, false); - trace_btrfs_space_reservation(root->fs_info, "delalloc", - btrfs_ino(inode), meta_reserve, 1); - - spin_lock(&block_rsv->lock); - block_rsv->qgroup_rsv_reserved += qgroup_reserve; - spin_unlock(&block_rsv->lock); - - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return 0; -out_qgroup: - btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); -out_fail: - btrfs_inode_rsv_release(inode, true); - if (delalloc_lock) - mutex_unlock(&inode->delalloc_mutex); - return ret; -} - -/** - * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation - * - * This will release the metadata reservation for an inode. This can be called - * once we complete IO for a given set of bytes to release their metadata - * reservations, or on error for the same reason. - */ -void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - - num_bytes = ALIGN(num_bytes, fs_info->sectorsize); - spin_lock(&inode->lock); - inode->csum_bytes -= num_bytes; - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. 
- * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. - * - * When we reserve space we increase outstanding_extents for the extents we may - * add. Once we've set the range as delalloc or created our ordered extents we - * have outstanding_extents to track the real usage, so we use this to free our - * temporarily tracked outstanding_extents. This _must_ be used in conjunction - * with btrfs_delalloc_reserve_metadata. - */ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - unsigned num_extents; - - spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); - btrfs_mod_outstanding_extents(inode, -num_extents); - btrfs_calculate_inode_block_rsv_size(fs_info, inode); - spin_unlock(&inode->lock); - - if (btrfs_is_testing(fs_info)) - return; - - btrfs_inode_rsv_release(inode, qgroup_free); -} - -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. - * - * This will do the following things - * - * o reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space - * (Done in check_data_free_space) - * - * o reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. - * o add to the inodes->delalloc_bytes - * o add it to the fs_info's delalloc inodes list. - * (Above 3 all done in delalloc_reserve_metadata) - * - * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) - */ -int btrfs_delalloc_reserve_space(struct inode *inode, - struct extent_changeset **reserved, u64 start, u64 len) -{ - int ret; - - ret = btrfs_check_data_free_space(inode, reserved, start, len); - if (ret < 0) - return ret; - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len); - if (ret < 0) - btrfs_free_reserved_data_space(inode, *reserved, start, len); - return ret; -} - -/** - * btrfs_delalloc_release_space - release data and metadata space for delalloc - * @inode: inode we're releasing space for - * @start: start position of the space already reserved - * @len: the len of the space already reserved - * @release_bytes: the len of the space we consumed or didn't use - * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. 
- */ -void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, - u64 start, u64 len, bool qgroup_free) -{ - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free); - btrfs_free_reserved_data_space(inode, reserved, start, len); -} - static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc) { @@ -6296,7 +4168,8 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val -= num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(info, + cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); @@ -6371,7 +4244,8 @@ static int pin_down_extent(struct btrfs_block_group_cache *cache, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; - update_bytes_pinned(cache->space_info, num_bytes); + btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info, + num_bytes); if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; @@ -6580,7 +4454,8 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, } else { cache->reserved += num_bytes; space_info->bytes_reserved += num_bytes; - update_bytes_may_use(space_info, -ram_bytes); + btrfs_space_info_update_bytes_may_use(cache->fs_info, + space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; } @@ -6646,7 +4521,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info) up_write(&fs_info->commit_root_sem); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); } /* @@ -6736,7 +4611,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; - update_bytes_pinned(space_info, -len); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len); trace_btrfs_space_reservation(fs_info, "pinned", space_info->flags, len, 0); @@ -6757,7 +4632,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, to_add = min(len, global_rsv->size - global_rsv->reserved); global_rsv->reserved += to_add; - update_bytes_may_use(space_info, to_add); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, to_add); if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; trace_btrfs_space_reservation(fs_info, @@ -6769,8 +4645,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&global_rsv->lock); /* Add to any tickets we may have */ if (len) - space_info_add_new_bytes(fs_info, space_info, - len); + btrfs_space_info_add_new_bytes(fs_info, + space_info, len); } spin_unlock(&space_info->lock); } @@ -7191,7 +5067,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, } out: if (pin) - add_pinned_bytes(fs_info, &generic_ref, 1); + add_pinned_bytes(fs_info, &generic_ref); if (last_ref) { /* @@ -7239,7 +5115,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) btrfs_ref_tree_mod(fs_info, ref); if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) - add_pinned_bytes(fs_info, ref, 1); + add_pinned_bytes(fs_info, ref); return ret; } @@ -7292,10 +5168,10 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } enum btrfs_loop_type { - LOOP_CACHING_NOWAIT = 0, - LOOP_CACHING_WAIT = 1, - LOOP_ALLOC_CHUNK = 2, - LOOP_NO_EMPTY_SIZE = 3, + 
LOOP_CACHING_NOWAIT, + LOOP_CACHING_WAIT, + LOOP_ALLOC_CHUNK, + LOOP_NO_EMPTY_SIZE, }; static inline void @@ -7661,8 +5537,8 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = do_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + CHUNK_ALLOC_FORCE); /* * If we can't allocate a new chunk we've already looped @@ -7758,7 +5634,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, trace_find_free_extent(fs_info, num_bytes, empty_size, flags); - space_info = __find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { btrfs_err(fs_info, "No space info for %llu", flags); return -ENOSPC; @@ -7863,9 +5739,8 @@ search: */ if (!block_group_bits(block_group, flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_RAID10; /* @@ -7984,60 +5859,6 @@ loop: return ret; } -#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ -do { \ - struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ - spin_lock(&__rsv->lock); \ - btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ - __rsv->size, __rsv->reserved); \ - spin_unlock(&__rsv->lock); \ -} while (0) - -static void dump_space_info(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *info, u64 bytes, - int dump_block_groups) -{ - struct btrfs_block_group_cache *cache; - int index = 0; - - spin_lock(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", - info->flags, - info->total_bytes - btrfs_space_info_used(info, true), - info->full ? "" : "not "); - btrfs_info(fs_info, - "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", - info->total_bytes, info->bytes_used, info->bytes_pinned, - info->bytes_reserved, info->bytes_may_use, - info->bytes_readonly); - spin_unlock(&info->lock); - - DUMP_BLOCK_RSV(fs_info, global_block_rsv); - DUMP_BLOCK_RSV(fs_info, trans_block_rsv); - DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); - DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); - - if (!dump_block_groups) - return; - - down_read(&info->groups_sem); -again: - list_for_each_entry(cache, &info->block_groups[index], list) { - spin_lock(&cache->lock); - btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", - cache->key.objectid, cache->key.offset, - btrfs_block_group_used(&cache->item), cache->pinned, - cache->reserved, cache->ro ? "[readonly]" : ""); - btrfs_dump_free_space(cache, bytes); - spin_unlock(&cache->lock); - } - if (++index < BTRFS_NR_RAID_TYPES) - goto again; - up_read(&info->groups_sem); -} - /* * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a * hole that is at least as big as @num_bytes. 
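The dump_space_info() helper deleted just above moves to space-info.c as btrfs_dump_space_info(); the "free" figure it prints is total_bytes minus the sum of the per-space_info counters, which is the same arithmetic the used + orig_bytes <= total_bytes fast path in __reserve_metadata_bytes() relies on earlier in this patch. A minimal, self-contained userspace model of that accounting is sketched below; the struct and helper names are illustrative stand-ins, not the kernel definitions.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct space_info_model {	/* illustrative stand-in, not struct btrfs_space_info */
	uint64_t total_bytes;
	uint64_t bytes_used;	/* extents that are allocated and referenced */
	uint64_t bytes_reserved;	/* allocations in flight */
	uint64_t bytes_pinned;	/* freed space, reclaimed at transaction commit */
	uint64_t bytes_readonly;
	uint64_t bytes_may_use;	/* outstanding reservations (delalloc, trans, ...) */
};

/* Same sum that btrfs_space_info_used(info, may_use) computes. */
static uint64_t space_info_used(const struct space_info_model *s, bool may_use)
{
	return s->bytes_used + s->bytes_reserved + s->bytes_pinned +
	       s->bytes_readonly + (may_use ? s->bytes_may_use : 0);
}

/*
 * Fast-path check from __reserve_metadata_bytes(): the reservation fits
 * without flushing iff used + orig_bytes <= total_bytes (the kernel can
 * additionally overcommit metadata; that path is left out of this model).
 */
static bool reservation_fits(const struct space_info_model *s, uint64_t orig_bytes)
{
	return space_info_used(s, true) + orig_bytes <= s->total_bytes;
}

int main(void)
{
	struct space_info_model s = {
		.total_bytes	= 8ULL << 30,
		.bytes_used	= 5ULL << 30,
		.bytes_reserved	= 1ULL << 30,
		.bytes_pinned	= 512ULL << 20,
		.bytes_readonly	= 0,
		.bytes_may_use	= 1ULL << 30,
	};

	/* "free" as dump_space_info() reports it. */
	printf("free: %llu\n",
	       (unsigned long long)(s.total_bytes - space_info_used(&s, true)));
	printf("64M fits without flushing: %d\n",
	       reservation_fits(&s, 64ULL << 20));
	return 0;
}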
@@ -8113,12 +5934,13 @@ again: } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; - sinfo = __find_space_info(fs_info, flags); + sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu", flags, num_bytes); if (sinfo) - dump_space_info(fs_info, sinfo, num_bytes, 1); + btrfs_dump_space_info(fs_info, sinfo, + num_bytes, 1); } } @@ -8456,73 +6278,6 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return buf; } -static struct btrfs_block_rsv * -use_block_rsv(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - int ret; - bool global_updated = false; - - block_rsv = get_block_rsv(trans, root); - - if (unlikely(block_rsv->size == 0)) - goto try_reserve; -again: - ret = block_rsv_use_bytes(block_rsv, blocksize); - if (!ret) - return block_rsv; - - if (block_rsv->failfast) - return ERR_PTR(ret); - - if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { - global_updated = true; - update_global_block_rsv(fs_info); - goto again; - } - - /* - * The global reserve still exists to save us from ourselves, so don't - * warn_on if we are short on our delayed refs reserve. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS && - btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { - static DEFINE_RATELIMIT_STATE(_rs, - DEFAULT_RATELIMIT_INTERVAL * 10, - /*DEFAULT_RATELIMIT_BURST*/ 1); - if (__ratelimit(&_rs)) - WARN(1, KERN_DEBUG - "BTRFS: block rsv returned %d\n", ret); - } -try_reserve: - ret = reserve_metadata_bytes(root, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - return block_rsv; - /* - * If we couldn't reserve metadata bytes try and use some from - * the global reserve if its space type is the same as the global - * reservation. - */ - if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL && - block_rsv->space_info == global_rsv->space_info) { - ret = block_rsv_use_bytes(global_rsv, blocksize); - if (!ret) - return global_rsv; - } - return ERR_PTR(ret); -} - -static void unuse_block_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, u32 blocksize) -{ - block_rsv_add_bytes(block_rsv, blocksize, false); - block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL); -} - /* * finds a free extent and does all the dirty work required for allocation * returns the tree buffer or an ERR_PTR on error. 
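The use_block_rsv()/unuse_block_rsv() pair deleted just above reappears in the next hunk as btrfs_use_block_rsv()/btrfs_unuse_block_rsv() from block-rsv.c. The bookkeeping underneath them (block_rsv_use_bytes() and block_rsv_add_bytes(), both visible earlier in this patch) reduces to a size/reserved/full triple; the sketch below is a simplified userspace model of those two helpers, using illustrative names rather than the kernel's types, and leaving out the release-excess and global-reserve fallback logic.

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct block_rsv_model {	/* illustrative stand-in, not struct btrfs_block_rsv */
	uint64_t size;		/* how much the reserve is supposed to hold */
	uint64_t reserved;	/* how much it actually holds */
	bool full;
};

/* Same shape as block_rsv_use_bytes(): consume bytes or fail. */
static int rsv_use_bytes(struct block_rsv_model *rsv, uint64_t num_bytes)
{
	if (rsv->reserved < num_bytes)
		return -1;	/* the kernel helper returns -ENOSPC */
	rsv->reserved -= num_bytes;
	if (rsv->reserved < rsv->size)
		rsv->full = false;
	return 0;
}

/* Same shape as block_rsv_add_bytes(): refill, optionally growing the target. */
static void rsv_add_bytes(struct block_rsv_model *rsv, uint64_t num_bytes,
			  bool update_size)
{
	rsv->reserved += num_bytes;
	if (update_size)
		rsv->size += num_bytes;
	else if (rsv->reserved >= rsv->size)
		rsv->full = true;
}

int main(void)
{
	struct block_rsv_model rsv = { .size = 1ULL << 20,
				       .reserved = 1ULL << 20,
				       .full = true };

	/*
	 * Pairing used by the tree block allocation path: take blocksize worth
	 * of the reserve up front and, on an error path, put it back (the
	 * kernel's unuse side additionally releases any excess above size).
	 */
	if (rsv_use_bytes(&rsv, 16384) == 0)
		rsv_add_bytes(&rsv, 16384, false);

	printf("size=%llu reserved=%llu full=%d\n",
	       (unsigned long long)rsv.size, (unsigned long long)rsv.reserved,
	       rsv.full);
	return 0;
}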
@@ -8555,7 +6310,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, } #endif - block_rsv = use_block_rsv(trans, root, blocksize); + block_rsv = btrfs_use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -8613,7 +6368,7 @@ out_free_buf: out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: - unuse_block_rsv(fs_info, block_rsv, blocksize); + btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -9552,9 +7307,8 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) num_devices = fs_info->fs_devices->rw_devices; - stripped = BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID56_MASK | + BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10; if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; @@ -9565,7 +7319,7 @@ static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) return stripped; /* turn mirroring into duplication */ - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + if (flags & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)) return stripped | BTRFS_BLOCK_GROUP_DUP; } else { @@ -9636,7 +7390,7 @@ out: btrfs_info(cache->fs_info, "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", sinfo_used, num_bytes, min_allocable_bytes); - dump_space_info(cache->fs_info, cache->space_info, 0, 0); + btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; } @@ -9678,8 +7432,7 @@ again: */ alloc_flags = update_block_group_flags(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, alloc_flags, - CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space * already allocated at the new raid level to @@ -9695,7 +7448,7 @@ again: if (!ret) goto out; alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); - ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); @@ -9716,7 +7469,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = get_alloc_profile(trans->fs_info, type); - return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } /* @@ -9949,7 +7702,7 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &root->fs_info->mapping_tree.map_tree; + em_tree = &root->fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, found_key.objectid, found_key.offset); @@ -10102,7 +7855,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); - release_global_block_rsv(info); + btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { int i; @@ -10118,7 +7871,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) - dump_space_info(info, space_info, 0, 0); + btrfs_dump_space_info(info, space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; @@ -10141,7 +7894,6 @@ void 
btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) struct btrfs_space_info *space_info; struct raid_kobject *rkobj; LIST_HEAD(list); - int index; int ret = 0; spin_lock(&fs_info->pending_raid_kobjs_lock); @@ -10149,11 +7901,10 @@ void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->pending_raid_kobjs_lock); list_for_each_entry(rkobj, &list, list) { - space_info = __find_space_info(fs_info, rkobj->flags); - index = btrfs_bg_flags_to_raid_index(rkobj->flags); + space_info = btrfs_find_space_info(fs_info, rkobj->flags); ret = kobject_add(&rkobj->kobj, &space_info->kobj, - "%s", get_raid_name(index)); + "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); if (ret) { kobject_put(&rkobj->kobj); break; @@ -10243,21 +7994,21 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, */ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct extent_map *em; struct btrfs_block_group_cache *bg; u64 start = 0; int ret = 0; while (1) { - read_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->lock); /* * lookup_extent_mapping will return the first extent map * intersecting the range, so setting @len to 1 is enough to * get the first chunk. */ - em = lookup_extent_mapping(&map_tree->map_tree, start, 1); - read_unlock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(map_tree, start, 1); + read_unlock(&map_tree->lock); if (!em) break; @@ -10417,9 +8168,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } trace_btrfs_add_block_group(info, cache, 0); - update_space_info(info, cache->flags, found_key.offset, - btrfs_block_group_used(&cache->item), - cache->bytes_super, &space_info); + btrfs_update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + cache->bytes_super, &space_info); cache->space_info = space_info; @@ -10437,9 +8188,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list_for_each_entry_rcu(space_info, &info->space_info, list) { if (!(get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | + BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_RAID56_MASK | BTRFS_BLOCK_GROUP_DUP))) continue; /* @@ -10457,7 +8207,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } btrfs_add_raid_kobjects(info); - init_global_block_rsv(info); + btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); @@ -10554,7 +8304,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. */ - cache->space_info = __find_space_info(fs_info, cache->flags); + cache->space_info = btrfs_find_space_info(fs_info, cache->flags); ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(fs_info, cache); @@ -10569,9 +8319,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, * the rbtree, update the space info's counters. 
*/ trace_btrfs_add_block_group(fs_info, cache, 1); - update_space_info(fs_info, cache->flags, size, bytes_used, + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); - update_global_block_rsv(fs_info); + btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -10598,6 +8348,35 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } +/* + * Clear incompat bits for the following feature(s): + * + * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group + * in the whole filesystem + */ +static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID56_MASK) { + struct list_head *head = &fs_info->space_info; + struct btrfs_space_info *sinfo; + + list_for_each_entry_rcu(sinfo, head, list) { + bool found = false; + + down_read(&sinfo->groups_sem); + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5])) + found = true; + if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6])) + found = true; + up_read(&sinfo->groups_sem); + + if (found) + return; + } + btrfs_clear_fs_incompat(fs_info, RAID56); + } +} + int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em) { @@ -10744,6 +8523,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, clear_avail_alloc_bits(fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); + clear_incompat_bg_bits(fs_info, block_group->flags); if (kobj) { kobject_del(kobj); kobject_put(kobj); @@ -10853,7 +8633,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (remove_em) { struct extent_map_tree *em_tree; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -10871,7 +8651,7 @@ struct btrfs_trans_handle * btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, const u64 chunk_offset) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree; + struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; unsigned int num_items; @@ -11020,7 +8800,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); - update_bytes_pinned(space_info, -block_group->pinned); + btrfs_space_info_update_bytes_pinned(fs_info, space_info, + -block_group->pinned); space_info->bytes_readonly += block_group->pinned; percpu_counter_add_batch(&space_info->total_bytes_pinned, -block_group->pinned, @@ -11076,43 +8857,6 @@ next: spin_unlock(&fs_info->unused_bgs_lock); } -int btrfs_init_space_info(struct btrfs_fs_info *fs_info) -{ - struct btrfs_super_block *disk_super; - u64 features; - u64 flags; - int mixed = 0; - int ret; - - disk_super = fs_info->super_copy; - if (!btrfs_super_root(disk_super)) - return -EINVAL; - - features = btrfs_super_incompat_flags(disk_super); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = 1; - - flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - if (mixed) { - flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, flags); - } else { - flags = BTRFS_BLOCK_GROUP_METADATA; - ret = create_space_info(fs_info, flags); - if (ret) - goto out; - - flags = BTRFS_BLOCK_GROUP_DATA; - ret = create_space_info(fs_info, 
flags); - } -out: - return ret; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { @@ -11171,12 +8915,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* Ensure we skip the reserved area in the first 1M */ + start = max_t(u64, start, SZ_1M); + /* * If find_first_clear_extent_bit find a range that spans the * end of the device it will set end to -1, in this case it's up * to the caller to trim the value to the size of the device. */ end = min(end, device->total_bytes - 1); + len = end - start + 1; /* We didn't find any extents */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5106008f5e28..1ff438fd5bc2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -359,6 +359,24 @@ do_insert: return NULL; } +/** + * __etree_search - searche @tree for an entry that contains @offset. Such + * entry would have entry->start <= offset && entry->end >= offset. + * + * @tree - the tree to search + * @offset - offset that should fall within an entry in @tree + * @next_ret - pointer to the first entry whose range ends after @offset + * @prev - pointer to the first entry whose range begins before @offset + * @p_ret - pointer where new node should be anchored (used when inserting an + * entry in the tree) + * @parent_ret - points to entry which would have been the parent of the entry, + * containing @offset + * + * This function returns a pointer to the entry that contains @offset byte + * address. If no such entry exists, then NULL is returned and the other + * pointer arguments to the function are filled, otherwise the found entry is + * returned and other pointers are left untouched. + */ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, struct rb_node **next_ret, struct rb_node **prev_ret, @@ -504,9 +522,11 @@ static int insert_state(struct extent_io_tree *tree, { struct rb_node *node; - if (end < start) - WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n", - end, start); + if (end < start) { + btrfs_err(tree->fs_info, + "insert state: end < start %llu %llu", end, start); + WARN_ON(1); + } state->start = start; state->end = end; @@ -516,7 +536,8 @@ static int insert_state(struct extent_io_tree *tree, if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); - pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n", + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", found->start, found->end, start, end); return -EEXIST; } @@ -1537,8 +1558,8 @@ out: } /** - * find_first_clear_extent_bit - finds the first range that has @bits not set - * and that starts after @start + * find_first_clear_extent_bit - find the first range that has @bits not set. + * This range could start before @start. 
* * @tree - the tree to search * @start - the offset at/after which the found extent should start @@ -1578,12 +1599,52 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, goto out; } } + /* + * At this point 'node' either contains 'start' or start is + * before 'node' + */ state = rb_entry(node, struct extent_state, rb_node); - if (in_range(start, state->start, state->end - state->start + 1) && - (state->state & bits)) { - start = state->end + 1; + + if (in_range(start, state->start, state->end - state->start + 1)) { + if (state->state & bits) { + /* + * |--range with bits sets--| + * | + * start + */ + start = state->end + 1; + } else { + /* + * 'start' falls within a range that doesn't + * have the bits set, so take its start as + * the beginning of the desired range + * + * |--range with bits cleared----| + * | + * start + */ + *start_ret = state->start; + break; + } } else { - *start_ret = start; + /* + * |---prev range---|---hole/unset---|---node range---| + * | + * start + * + * or + * + * |---hole/unset--||--first node--| + * 0 | + * start + */ + if (prev) { + state = rb_entry(prev, struct extent_state, + rb_node); + *start_ret = state->end + 1; + } else { + *start_ret = 0; + } break; } } @@ -1719,10 +1780,10 @@ static noinline int lock_delalloc_pages(struct inode *inode, */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, - struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -2800,12 +2861,11 @@ static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) * never fail. We're returning a bio right now but you can call btrfs_io_bio * for the appropriate container_of magic */ -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) +struct bio *btrfs_bio_alloc(u64 first_byte) { struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset); - bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; @@ -2916,7 +2976,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, offset); + bio = btrfs_bio_alloc(offset); + bio_set_dev(bio, bdev); bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; @@ -3204,21 +3265,10 @@ static inline void contiguous_readpages(struct extent_io_tree *tree, unsigned long *bio_flags, u64 *prev_em_start) { - struct inode *inode; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host); int index; - inode = pages[0]->mapping->host; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - end - start + 1); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], btrfs_get_extent, em_cached, @@ -3234,22 +3284,12 @@ static int __extent_read_full_page(struct extent_io_tree *tree, unsigned long *bio_flags, unsigned int read_flags) { - struct inode *inode = page->mapping->host; - struct btrfs_ordered_extent *ordered; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); u64 
end = start + PAGE_SIZE - 1; int ret; - while (1) { - lock_extent(tree, start, end); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - PAGE_SIZE); - if (!ordered) - break; - unlock_extent(tree, start, end); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL); ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, bio_flags, read_flags, NULL); @@ -3290,7 +3330,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, struct page *page, struct writeback_control *wbc, u64 delalloc_start, unsigned long *nr_written) { - struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; u64 page_end = delalloc_start + PAGE_SIZE - 1; bool found; u64 delalloc_to_write = 0; @@ -3300,8 +3339,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, while (delalloc_end < page_end) { - found = find_lock_delalloc_range(inode, tree, - page, + found = find_lock_delalloc_range(inode, page, &delalloc_start, &delalloc_end); if (!found) { @@ -3310,7 +3348,6 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, } ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); - /* File system has been set read-only */ if (ret) { SetPageError(page); /* @@ -4542,6 +4579,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(inode)->root; struct fiemap_cache cache = { 0 }; + struct ulist *roots; + struct ulist *tmp_ulist; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4555,6 +4594,13 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + roots = ulist_alloc(GFP_KERNEL); + tmp_ulist = ulist_alloc(GFP_KERNEL); + if (!roots || !tmp_ulist) { + ret = -ENOMEM; + goto out_free_ulist; + } + start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -4565,8 +4611,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), -1, 0); if (ret < 0) { - btrfs_free_path(path); - return ret; + goto out_free_ulist; } else { WARN_ON(!ret); if (ret == 1) @@ -4675,7 +4720,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ ret = btrfs_check_shared(root, btrfs_ino(BTRFS_I(inode)), - bytenr); + bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; if (ret) @@ -4718,9 +4763,13 @@ out_free: ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - btrfs_free_path(path); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); + +out_free_ulist: + btrfs_free_path(path); + ulist_free(roots); + ulist_free(tmp_ulist); return ret; } @@ -4808,7 +4857,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->bflags = 0; rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); - atomic_set(&eb->blocking_writers, 0); + eb->blocking_writers = 0; eb->lock_nested = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -4827,10 +4876,10 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); #ifdef CONFIG_BTRFS_DEBUG - atomic_set(&eb->spinning_writers, 0); + eb->spinning_writers = 0; atomic_set(&eb->spinning_readers, 0); 
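	/*
	 * Illustrative sketch, not part of the patch: the extent_io.h and
	 * locking.c hunks further below convert eb->blocking_writers (and the
	 * debug-only spinning_writers and write_locks counters) from atomic_t
	 * to plain int, presumably because only the thread that owns the
	 * extent buffer's write lock ever modifies them, so atomic
	 * read-modify-write is unnecessary. A converted call site then
	 * reduces to:
	 *
	 *	write_lock(&eb->lock);
	 *	eb->blocking_writers++;		(plain increment, one modifier)
	 *	write_unlock(&eb->lock);
	 */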
atomic_set(&eb->read_locks, 0); - atomic_set(&eb->write_locks, 0); + eb->write_locks = 0; #endif return eb; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index aa18a16a6ed7..401423b16976 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -167,7 +167,7 @@ struct extent_buffer { struct rcu_head rcu_head; pid_t lock_owner; - atomic_t blocking_writers; + int blocking_writers; atomic_t blocking_readers; bool lock_nested; /* >= 0 if eb belongs to a log tree, -1 otherwise */ @@ -187,10 +187,10 @@ struct extent_buffer { wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG - atomic_t spinning_writers; + int spinning_writers; atomic_t spinning_readers; atomic_t read_locks; - atomic_t write_locks; + int write_locks; struct list_head leak_list; #endif }; @@ -497,7 +497,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, u64 delalloc_end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte); +struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); @@ -549,7 +549,7 @@ int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -bool find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, +bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d431ea8198e4..1a599f50837b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -8,6 +8,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/sched/mm.h> +#include <crypto/hash.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -22,9 +23,13 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ - sizeof(struct btrfs_ordered_sum)) / \ - sizeof(u32) * (fs_info)->sectorsize) +static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, + u16 csum_size) +{ + u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + + return ncsums * fs_info->sectorsize; +} int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -144,7 +149,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u32 *dst, int dio) + u64 logical_offset, u8 *dst, int dio) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -182,7 +187,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } csum = btrfs_bio->csum; } else { - csum = (u8 *)dst; + csum = dst; } if (bio->bi_iter.bi_size > PAGE_SIZE * 8) @@ -211,7 +216,7 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio if (!dio) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - (u32 *)csum, nblocks); + csum, nblocks); if (count) goto found; @@ -283,7 +288,8 @@ next: return 0; } -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) +blk_status_t 
btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u8 *dst) { return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } @@ -374,7 +380,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(fs_info)); + max_ordered_sum_bytes(fs_info, csum_size)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -427,6 +433,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, u64 file_start, int contig) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered = NULL; char *data; @@ -439,6 +446,7 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, int i; u64 offset; unsigned nofs_flag; + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); nofs_flag = memalloc_nofs_save(); sums = kvzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), @@ -459,6 +467,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; + shash->tfm = fs_info->csum_shash; + bio_for_each_segment(bvec, bio, iter) { if (!contig) offset = page_offset(bvec.bv_page) + bvec.bv_offset; @@ -498,17 +508,14 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, index = 0; } - sums->sums[index] = ~(u32)0; + crypto_shash_init(shash); data = kmap_atomic(bvec.bv_page); - sums->sums[index] - = btrfs_csum_data(data + bvec.bv_offset - + (i * fs_info->sectorsize), - sums->sums[index], - fs_info->sectorsize); + crypto_shash_update(shash, data + bvec.bv_offset + + (i * fs_info->sectorsize), + fs_info->sectorsize); kunmap_atomic(data); - btrfs_csum_final(sums->sums[index], - (char *)(sums->sums + index)); - index++; + crypto_shash_final(shash, (char *)(sums->sums + index)); + index += csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; total_bytes += fs_info->sectorsize; @@ -904,9 +911,9 @@ found: write_extent_buffer(leaf, sums->sums + index, (unsigned long)item, ins_size); + index += ins_size; ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); if (total_bytes < sums->len) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 89f5be2bfb43..58a18ed11546 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -26,6 +26,7 @@ #include "volumes.h" #include "qgroup.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* @@ -1550,30 +1551,20 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; - struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; u64 num_bytes; int ret; ret = btrfs_start_write_no_snapshotting(root); if (!ret) - return -ENOSPC; + return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; - while (1) { - lock_extent(&inode->io_tree, lockstart, lockend); - ordered = btrfs_lookup_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (!ordered) { - break; - } - unlock_extent(&inode->io_tree, lockstart, lockend); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } + 
btrfs_lock_and_flush_ordered_range(&inode->io_tree, inode, lockstart, + lockend, NULL); num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, @@ -2721,6 +2712,11 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ + struct timespec64 now = current_time(inode); + + inode_inc_iversion(inode); + inode->i_mtime = now; + inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); @@ -2801,9 +2797,9 @@ static int btrfs_fallocate_update_isize(struct inode *inode, } enum { - RANGE_BOUNDARY_WRITTEN_EXTENT = 0, - RANGE_BOUNDARY_PREALLOC_EXTENT = 1, - RANGE_BOUNDARY_HOLE = 2, + RANGE_BOUNDARY_WRITTEN_EXTENT, + RANGE_BOUNDARY_PREALLOC_EXTENT, + RANGE_BOUNDARY_HOLE, }; static int btrfs_zero_range_check_range_boundary(struct inode *inode, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f74dc259307b..062be9dde4c6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,8 @@ #include "extent_io.h" #include "inode-map.h" #include "volumes.h" +#include "space-info.h" +#include "delalloc-space.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K @@ -465,9 +467,8 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -493,9 +494,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_csum_data(io_ctl->orig + offset, crc, - PAGE_SIZE - offset); - btrfs_csum_final(crc, (u8 *)&crc); + crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); @@ -3166,8 +3166,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, space_info->bytes_readonly += reserved_bytes; block_group->reserved -= reserved_bytes; space_info->bytes_reserved -= reserved_bytes; - spin_unlock(&space_info->lock); spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); } return ret; @@ -3358,7 +3358,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) if (cleanup) { mutex_lock(&fs_info->chunk_mutex); - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->key.objectid, 1); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index ffca2abf13d0..2e8bb402050b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -11,6 +11,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include "transaction.h" +#include "delalloc-space.h" static int caching_kthread(void *data) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2aabdb85226..1af069a9a0c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -47,6 +47,7 @@ #include "props.h" #include "qgroup.h" #include "dedupe.h" +#include "delalloc-space.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -1932,17 +1933,19 @@ int btrfs_bio_fits_in_stripe(struct page *page, size_t size, 
struct bio *bio, u64 length = 0; u64 map_length; int ret; + struct btrfs_io_geometry geom; if (bio_flags & EXTENT_BIO_COMPRESSED) return 0; length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length, + &geom); if (ret < 0) return ret; - if (map_length < length + size) + + if (geom.len < length + size) return 1; return 0; } @@ -3203,16 +3206,23 @@ static int __readpage_endio_check(struct inode *inode, int icsum, struct page *page, int pgoff, u64 start, size_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; - u32 csum_expected; - u32 csum = ~(u32)0; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + u8 *csum_expected; + u8 csum[BTRFS_CSUM_SIZE]; - csum_expected = *(((u32 *)io_bio->csum) + icsum); + csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size; kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + pgoff, csum, len); - btrfs_csum_final(csum, (u8 *)&csum); - if (csum != csum_expected) + shash->tfm = fs_info->csum_shash; + + crypto_shash_init(shash); + crypto_shash_update(shash, kaddr + pgoff, len); + crypto_shash_final(shash, csum); + + if (memcmp(csum, csum_expected, csum_size)) goto zeroit; kunmap_atomic(kaddr); @@ -3286,6 +3296,28 @@ void btrfs_add_delayed_iput(struct inode *inode) wake_up_process(fs_info->cleaner_kthread); } +static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + list_del_init(&inode->delayed_iput); + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) + wake_up(&fs_info->delayed_iputs_wait); + spin_lock(&fs_info->delayed_iput_lock); +} + +static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) +{ + if (!list_empty(&inode->delayed_iput)) { + spin_lock(&fs_info->delayed_iput_lock); + if (!list_empty(&inode->delayed_iput)) + run_delayed_iput_locked(fs_info, inode); + spin_unlock(&fs_info->delayed_iput_lock); + } +} + void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { @@ -3295,12 +3327,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); - list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); - iput(&inode->vfs_inode); - if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) - wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + run_delayed_iput_locked(fs_info, inode); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -3935,9 +3962,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; - struct extent_buffer *leaf; struct btrfs_dir_item *di; - struct btrfs_key key; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); @@ -3955,8 +3980,6 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = di ? 
PTR_ERR(di) : -ENOENT; goto err; } - leaf = path->nodes[0]; - btrfs_dir_item_key_to_cpu(leaf, di, &key); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; @@ -4009,6 +4032,17 @@ skip_backref: ret = 0; else if (ret) btrfs_abort_transaction(trans, ret); + + /* + * If we have a pending delayed iput we could end up with the final iput + * being run in btrfs-cleaner context. If we have enough of these built + * up we can end up burning a lot of time in btrfs-cleaner without any + * way to throttle the unlinks. Since we're currently holding a ref on + * the inode we can run the delayed iput here without any issues as the + * final iput won't be done until after we drop the ref we're currently + * holding. + */ + btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) @@ -5008,21 +5042,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) if (size <= hole_start) return 0; - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent_bits(io_tree, hole_start, block_end - 1, - &cached_state); - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start, - block_end - hole_start); - if (!ordered) - break; - unlock_extent_cached(io_tree, hole_start, block_end - 1, - &cached_state); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - } - + btrfs_lock_and_flush_ordered_range(io_tree, BTRFS_I(inode), hole_start, + block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, @@ -8318,22 +8339,21 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) struct bio *orig_bio = dip->orig_bio; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; - u64 map_length; int async_submit = 0; u64 submit_len; int clone_offset = 0; int clone_len; int ret; blk_status_t status; + struct btrfs_io_geometry geom; - map_length = orig_bio->bi_iter.bi_size; - submit_len = map_length; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, - &map_length, NULL, 0); + submit_len = orig_bio->bi_iter.bi_size; + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) return -EIO; - if (map_length >= submit_len) { + if (geom.len >= submit_len) { bio = orig_bio; dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; @@ -8346,10 +8366,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) async_submit = 1; /* bio split */ - ASSERT(map_length <= INT_MAX); + ASSERT(geom.len <= INT_MAX); atomic_inc(&dip->pending_bios); do { - clone_len = min_t(int, submit_len, map_length); + clone_len = min_t(int, submit_len, geom.len); /* * This will never fail as it's passing GPF_NOFS and @@ -8386,9 +8406,8 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) start_sector += clone_len >> 9; file_offset += clone_len; - map_length = submit_len; - ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), - start_sector << 9, &map_length, NULL, 0); + ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio), + start_sector << 9, submit_len, &geom); if (ret) goto out_err; } while (submit_len > 0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cfeff1b8dce0..818f7ec8bb0e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -43,6 +43,8 @@ #include "qgroup.h" #include "tree-log.h" #include "compression.h" +#include "space-info.h" +#include "delalloc-space.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, 
then the UAPI @@ -3993,6 +3995,27 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (!same_inode) inode_dio_wait(inode_out); + /* + * Workaround to make sure NOCOW buffered write reach disk as NOCOW. + * + * Btrfs' back references do not have a block level granularity, they + * work at the whole extent level. + * NOCOW buffered write without data space reserved may not be able + * to fall back to CoW due to lack of data space, thus could cause + * data loss. + * + * Here we take a shortcut by flushing the whole inode, so that all + * nocow write should reach disk as nocow before we increase the + * reference of the extent. We could do better by only flushing NOCOW + * data, but that needs extra accounting. + * + * Also we don't need to check ASYNC_EXTENT, as async extent will be + * CoWed anyway, not affecting nocow part. + */ + ret = filemap_flush(inode_in->i_mapping); + if (ret < 0) + return ret; + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2f6c3c7851ed..98fccce4208c 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -15,19 +15,19 @@ #ifdef CONFIG_BTRFS_DEBUG static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers)); - atomic_inc(&eb->spinning_writers); + WARN_ON(eb->spinning_writers); + eb->spinning_writers++; } static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers) != 1); - atomic_dec(&eb->spinning_writers); + WARN_ON(eb->spinning_writers != 1); + eb->spinning_writers--; } static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { - WARN_ON(atomic_read(&eb->spinning_writers)); + WARN_ON(eb->spinning_writers); } static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) @@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { - atomic_inc(&eb->write_locks); + eb->write_locks++; } static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { - atomic_dec(&eb->write_locks); + eb->write_locks--; } void btrfs_assert_tree_locked(struct extent_buffer *eb) { - BUG_ON(!atomic_read(&eb->write_locks)); + BUG_ON(!eb->write_locks); } #else @@ -111,10 +111,10 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - if (atomic_read(&eb->blocking_writers) == 0) { + if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); btrfs_assert_tree_locked(eb); - atomic_inc(&eb->blocking_writers); + eb->blocking_writers++; write_unlock(&eb->lock); } } @@ -148,12 +148,11 @@ void btrfs_clear_lock_blocking_write(struct extent_buffer *eb) */ if (eb->lock_nested && current->pid == eb->lock_owner) return; - BUG_ON(atomic_read(&eb->blocking_writers) != 1); write_lock(&eb->lock); + BUG_ON(eb->blocking_writers != 1); btrfs_assert_spinning_writers_get(eb); - /* atomic_dec_and_test implies a barrier */ - if (atomic_dec_and_test(&eb->blocking_writers)) - cond_wake_up_nomb(&eb->write_lock_wq); + if (--eb->blocking_writers == 0) + cond_wake_up(&eb->write_lock_wq); } /* @@ -167,12 +166,10 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); again: - BUG_ON(!atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner); - read_lock(&eb->lock); - if 
(atomic_read(&eb->blocking_writers) && - current->pid == eb->lock_owner) { + BUG_ON(eb->blocking_writers == 0 && + current->pid == eb->lock_owner); + if (eb->blocking_writers && current->pid == eb->lock_owner) { /* * This extent is already write-locked by our thread. We allow * an additional read lock to be added because it's for the same @@ -185,10 +182,10 @@ again: trace_btrfs_tree_read_lock(eb, start_ns); return; } - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); wait_event(eb->write_lock_wq, - atomic_read(&eb->blocking_writers) == 0); + eb->blocking_writers == 0); goto again; } btrfs_assert_tree_read_locks_get(eb); @@ -203,11 +200,11 @@ again: */ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (eb->blocking_writers) return 0; read_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); return 0; } @@ -223,13 +220,13 @@ int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers)) + if (eb->blocking_writers) return 0; if (!read_trylock(&eb->lock)) return 0; - if (atomic_read(&eb->blocking_writers)) { + if (eb->blocking_writers) { read_unlock(&eb->lock); return 0; } @@ -245,13 +242,11 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) return 0; write_lock(&eb->lock); - if (atomic_read(&eb->blocking_writers) || - atomic_read(&eb->blocking_readers)) { + if (eb->blocking_writers || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); return 0; } @@ -322,10 +317,9 @@ void btrfs_tree_lock(struct extent_buffer *eb) WARN_ON(eb->lock_owner == current->pid); again: wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0); - wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); + wait_event(eb->write_lock_wq, eb->blocking_writers == 0); write_lock(&eb->lock); - if (atomic_read(&eb->blocking_readers) || - atomic_read(&eb->blocking_writers)) { + if (atomic_read(&eb->blocking_readers) || eb->blocking_writers) { write_unlock(&eb->lock); goto again; } @@ -340,7 +334,7 @@ again: */ void btrfs_tree_unlock(struct extent_buffer *eb) { - int blockers = atomic_read(&eb->blocking_writers); + int blockers = eb->blocking_writers; BUG_ON(blockers > 1); @@ -351,7 +345,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { btrfs_assert_no_spinning_writers(eb); - atomic_dec(&eb->blocking_writers); + eb->blocking_writers--; /* Use the lighter barrier after atomic */ smp_mb__after_atomic(); cond_wake_up_nomb(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52889da69113..1744ba8b2754 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -13,6 +13,7 @@ #include "extent_io.h" #include "disk-io.h" #include "compression.h" +#include "delalloc-space.h" static struct kmem_cache *btrfs_ordered_extent_cache; @@ -924,14 +925,16 @@ out: * be reclaimed before their checksum is actually put into the btree */ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len) + u8 *sum, int len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent 
*ordered; struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); @@ -947,10 +950,10 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, num_sectors = ordered_sum->len >> inode->i_sb->s_blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); - memcpy(sum + index, ordered_sum->sums + i, - num_sectors); + memcpy(sum + index, ordered_sum->sums + i * csum_size, + num_sectors * csum_size); - index += (int)num_sectors; + index += (int)num_sectors * csum_size; if (index == len) goto out; disk_bytenr += num_sectors * sectorsize; @@ -962,6 +965,51 @@ out: return index; } +/* + * btrfs_flush_ordered_range - Lock the passed range and ensures all pending + * ordered extents in it are run to completion. + * + * @tree: IO tree used for locking out other users of the range + * @inode: Inode whose ordered tree is to be searched + * @start: Beginning of range to flush + * @end: Last byte of range to lock + * @cached_state: If passed, will return the extent state responsible for the + * locked range. It's the caller's responsibility to free the cached state. + * + * This function always returns with the given range locked, ensuring after it's + * called no order extent can be pending. + */ +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state) +{ + struct btrfs_ordered_extent *ordered; + struct extent_state *cachedp = NULL; + + if (cached_state) + cachedp = *cached_state; + + while (1) { + lock_extent_bits(tree, start, end, &cachedp); + ordered = btrfs_lookup_ordered_range(inode, start, + end - start + 1); + if (!ordered) { + /* + * If no external cached_state has been passed then + * decrement the extra ref taken for cachedp since we + * aren't exposing it outside of this function + */ + if (!cached_state) + refcount_dec(&cachedp->refs); + break; + } + unlock_extent_cached(tree, start, end, &cachedp); + btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } +} + int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4c5991c3de14..5204171ea962 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -23,7 +23,7 @@ struct btrfs_ordered_sum { int len; struct list_head list; /* last field is a variable length array of csums */ - u32 sums[]; + u8 sums[]; }; /* @@ -183,11 +183,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u32 *sum, int len); + u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); u64 btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len); +void btrfs_lock_and_flush_ordered_range(struct extent_io_tree *tree, + struct btrfs_inode *inode, u64 start, + u64 end, + struct extent_state **cached_state); int __init ordered_data_init(void); void __cold ordered_data_exit(void); diff --git a/fs/btrfs/print-tree.c 
b/fs/btrfs/print-tree.c index 1141ca5fae6a..9cb50577d982 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -153,11 +153,11 @@ static void print_eb_refs_lock(struct extent_buffer *eb) #ifdef CONFIG_BTRFS_DEBUG btrfs_info(eb->fs_info, "refs %u lock (w:%d r:%d bw:%d br:%d sw:%d sr:%d) lock_owner %u current %u", - atomic_read(&eb->refs), atomic_read(&eb->write_locks), + atomic_read(&eb->refs), eb->write_locks, atomic_read(&eb->read_locks), - atomic_read(&eb->blocking_writers), + eb->blocking_writers, atomic_read(&eb->blocking_readers), - atomic_read(&eb->spinning_writers), + eb->spinning_writers, atomic_read(&eb->spinning_readers), eb->lock_owner, current->pid); #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index a9e2e66152ee..e0469816c678 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -257,11 +257,7 @@ static int prop_compression_validate(const char *value, size_t len) if (!value) return 0; - if (!strncmp("lzo", value, 3)) - return 0; - else if (!strncmp("zlib", value, 4)) - return 0; - else if (!strncmp("zstd", value, 4)) + if (btrfs_compress_is_valid_type(value, len)) return 0; return -EINVAL; @@ -341,7 +337,7 @@ static int inherit_props(struct btrfs_trans_handle *trans, for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { const struct prop_handler *h = &prop_handlers[i]; const char *value; - u64 num_bytes; + u64 num_bytes = 0; if (!h->inheritable) continue; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3e6ffbbd8b0a..f8a3c1b0a15a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2614,6 +2614,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, int ret = 0; int i; u64 *i_qgroups; + bool committing = false; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; @@ -2621,7 +2622,25 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, u32 level_size = 0; u64 nums; - mutex_lock(&fs_info->qgroup_ioctl_lock); + /* + * There are only two callers of this function. + * + * One in create_subvol() in the ioctl context, which needs to hold + * the qgroup_ioctl_lock. + * + * The other one in create_pending_snapshot() where no other qgroup + * code can modify the fs as they all need to either start a new trans + * or hold a trans handler, thus we don't need to hold + * qgroup_ioctl_lock. + * This would avoid long and complex lock chain and make lockdep happy. 
+ */ + spin_lock(&fs_info->trans_lock); + if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) + committing = true; + spin_unlock(&fs_info->trans_lock); + + if (!committing) + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) goto out; @@ -2785,7 +2804,8 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, unlock: spin_unlock(&fs_info->qgroup_lock); out: - mutex_unlock(&fs_info->qgroup_ioctl_lock); + if (!committing) + mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index f5d4c13a8dbc..2503485db859 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,7 +7,7 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(struct map_lookup *map) +static inline int nr_parity_stripes(const struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) return 1; @@ -17,7 +17,7 @@ static inline int nr_parity_stripes(struct map_lookup *map) return 0; } -static inline int nr_data_stripes(struct map_lookup *map) +static inline int nr_data_stripes(const struct map_lookup *map) { return map->num_stripes - nr_parity_stripes(map); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 22a3c69864fa..7f219851fa23 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -20,6 +20,7 @@ #include "inode-map.h" #include "qgroup.h" #include "print-tree.h" +#include "delalloc-space.h" /* * backref_node, mapping_node and tree_block start with this diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 22124122728c..47733fb55df7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -9,6 +9,8 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "qgroup.h" +#include "space-info.h" /* * Read a root item from the tree. In case we detect a root item smaller then @@ -497,3 +499,57 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec); spin_unlock(&root->root_item_lock); } + +/* + * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * root: the root of the parent directory + * rsv: block reservation + * items: the number of items that we need do reservation + * use_global_rsv: allow fallback to the global block reservation + * + * This function is used to reserve the space for snapshot/subvolume + * creation and deletion. Those operations are different with the + * common file/directory operations, they change two fs/file trees + * and root tree, the number of items that the qgroup reserves is + * different with the free space reservation. So we can not use + * the space reservation mechanism in start_transaction(). 
+ */ +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, int items, + bool use_global_rsv) +{ + u64 qgroup_num_bytes = 0; + u64 num_bytes; + int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + /* One for parent inode, two for dir entries */ + qgroup_num_bytes = 3 * fs_info->nodesize; + ret = btrfs_qgroup_reserve_meta_prealloc(root, + qgroup_num_bytes, true); + if (ret) + return ret; + } + + num_bytes = btrfs_calc_trans_metadata_size(fs_info, items); + rsv->space_info = btrfs_find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + ret = btrfs_block_rsv_add(root, rsv, num_bytes, + BTRFS_RESERVE_FLUSH_ALL); + + if (ret == -ENOSPC && use_global_rsv) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true); + + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + + return ret; +} + +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv) +{ + btrfs_block_rsv_release(fs_info, rsv, (u64)-1); +} diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f7b29f9db5e2..0c99cf9fb595 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -6,6 +6,7 @@ #include <linux/blkdev.h> #include <linux/ratelimit.h> #include <linux/sched/mm.h> +#include <crypto/hash.h> #include "ctree.h" #include "volumes.h" #include "disk-io.h" @@ -1787,11 +1788,12 @@ static int scrub_checksum(struct scrub_block *sblock) static int scrub_checksum_data(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; u8 *on_disk_csum; struct page *page; void *buffer; - u32 crc = ~(u32)0; u64 len; int index; @@ -1799,6 +1801,9 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (!sblock->pagev[0]->have_csum) return 0; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + on_disk_csum = sblock->pagev[0]->csum; page = sblock->pagev[0]->page; buffer = kmap_atomic(page); @@ -1808,7 +1813,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, PAGE_SIZE); - crc = btrfs_csum_data(buffer, crc, l); + crypto_shash_update(shash, buffer, l); kunmap_atomic(buffer); len -= l; if (len == 0) @@ -1820,7 +1825,7 @@ static int scrub_checksum_data(struct scrub_block *sblock) buffer = kmap_atomic(page); } - btrfs_csum_final(crc, csum); + crypto_shash_final(shash, csum); if (memcmp(csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1832,16 +1837,19 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1875,7 +1883,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1889,7 
+1897,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) sblock->checksum_error = 1; @@ -1900,18 +1908,22 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; void *mapped_buffer; u64 mapped_size; void *p; - u32 crc = ~(u32)0; int fail_gen = 0; int fail_cor = 0; u64 len; int index; + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); + BUG_ON(sblock->page_count < 1); page = sblock->pagev[0]->page; mapped_buffer = kmap_atomic(page); @@ -1934,7 +1946,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) for (;;) { u64 l = min_t(u64, len, mapped_size); - crc = btrfs_csum_data(p, crc, l); + crypto_shash_update(shash, p, l); kunmap_atomic(mapped_buffer); len -= l; if (len == 0) @@ -1948,7 +1960,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) p = mapped_buffer; } - btrfs_csum_final(crc, calculated_csum); + crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size)) ++fail_cor; @@ -2448,7 +2460,7 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ASSERT(index < UINT_MAX); num_sectors = sum->len / sctx->fs_info->sectorsize; - memcpy(csum, sum->sums + index, sctx->csum_size); + memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size); if (index == num_sectors - 1) { list_del(&sum->list); kfree(sum); @@ -2660,18 +2672,18 @@ static int get_raid56_logic_offset(u64 physical, int num, u64 last_offset; u32 stripe_index; u32 rot; + const int data_stripes = nr_data_stripes(map); - last_offset = (physical - map->stripes[num].physical) * - nr_data_stripes(map); + last_offset = (physical - map->stripes[num].physical) * data_stripes; if (stripe_start) *stripe_start = last_offset; *offset = last_offset; - for (i = 0; i < nr_data_stripes(map); i++) { + for (i = 0; i < data_stripes; i++) { *offset = last_offset + i * map->stripe_len; stripe_nr = div64_u64(*offset, map->stripe_len); - stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(stripe_nr, data_stripes); /* Work out the disk rotation on this stripe-set */ stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); @@ -3079,7 +3091,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, offset = map->stripe_len * (num / map->sub_stripes); increment = map->stripe_len * factor; mirror_num = num % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { @@ -3410,15 +3422,15 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_block_group_cache *cache) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; int ret = 0; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); + 
read_lock(&map_tree->lock); + em = lookup_extent_mapping(map_tree, chunk_offset, 1); + read_unlock(&map_tree->lock); if (!em) { /* diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f7fe4770f0e5..69b59bf75882 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -686,7 +686,7 @@ static int send_cmd(struct send_ctx *sctx) hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); hdr->crc = 0; - crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -6929,9 +6929,23 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; + mutex_lock(&fs_info->balance_mutex); + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + mutex_unlock(&fs_info->balance_mutex); + btrfs_warn_rl(fs_info, + "cannot run send because a balance operation is in progress"); + ret = -EAGAIN; + goto out; + } + fs_info->send_in_progress++; + mutex_unlock(&fs_info->balance_mutex); + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; + mutex_lock(&fs_info->balance_mutex); + fs_info->send_in_progress--; + mutex_unlock(&fs_info->balance_mutex); if (ret < 0) goto out; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c new file mode 100644 index 000000000000..ab7b9ec4c240 --- /dev/null +++ b/fs/btrfs/space-info.c @@ -0,0 +1,1094 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "space-info.h" +#include "sysfs.h" +#include "volumes.h" +#include "free-space-cache.h" +#include "ordered-data.h" +#include "transaction.h" +#include "math.h" + +u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included) +{ + ASSERT(s_info); + return s_info->bytes_used + s_info->bytes_reserved + + s_info->bytes_pinned + s_info->bytes_readonly + + (may_use_included ? s_info->bytes_may_use : 0); +} + +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. 
+ */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int i; + int ret; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + ret = percpu_counter_init(&space_info->total_bytes_pinned, 0, + GFP_KERNEL); + if (ret) { + kfree(space_info); + return ret; + } + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&space_info->block_groups[i]); + init_rwsem(&space_info->groups_sem); + spin_lock_init(&space_info->lock); + space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + init_waitqueue_head(&space_info->wait); + INIT_LIST_HEAD(&space_info->ro_bgs); + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + + ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(space_info->flags)); + if (ret) { + kobject_put(&space_info->kobj); + return ret; + } + + list_add_rcu(&space_info->list, &info->space_info); + if (flags & BTRFS_BLOCK_GROUP_DATA) + info->data_sinfo = space_info; + + return ret; +} + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *disk_super; + u64 features; + u64 flags; + int mixed = 0; + int ret; + + disk_super = fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + return -EINVAL; + + features = btrfs_super_incompat_flags(disk_super); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = 1; + + flags = BTRFS_BLOCK_GROUP_SYSTEM; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + if (mixed) { + flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } else { + flags = BTRFS_BLOCK_GROUP_METADATA; + ret = create_space_info(fs_info, flags); + if (ret) + goto out; + + flags = BTRFS_BLOCK_GROUP_DATA; + ret = create_space_info(fs_info, flags); + } +out: + return ret; +} + +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + int factor; + + factor = btrfs_bg_type_to_factor(flags); + + found = btrfs_find_space_info(info, flags); + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; + if (total_bytes > 0) + found->full = 0; + btrfs_space_info_add_new_bytes(info, found, + total_bytes - bytes_used - + bytes_readonly); + spin_unlock(&found->lock); + *space_info = found; +} + +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + flags &= 
BTRFS_BLOCK_GROUP_TYPE_MASK; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & flags) { + rcu_read_unlock(); + return found; + } + } + rcu_read_unlock(); + return NULL; +} + +static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) +{ + return (global->size << 1); +} + +static int can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + u64 profile; + u64 space_size; + u64 avail; + u64 used; + int factor; + + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + if (system_chunk) + profile = btrfs_system_alloc_profile(fs_info); + else + profile = btrfs_metadata_alloc_profile(fs_info); + + used = btrfs_space_info_used(space_info, false); + + /* + * We only want to allow over committing if we have lots of actual space + * free, but if we don't have enough space to handle the global reserve + * space then we could end up having a real enospc problem when trying + * to allocate a chunk or some other such important allocation. + */ + spin_lock(&global_rsv->lock); + space_size = calc_global_rsv_need_space(global_rsv); + spin_unlock(&global_rsv->lock); + if (used + space_size >= space_info->total_bytes) + return 0; + + used += space_info->bytes_may_use; + + avail = atomic64_read(&fs_info->free_chunk_space); + + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually usable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); + + /* + * If we aren't flushing all things, let us overcommit up to + * 1/2th of the space. If we can flush, don't let us overcommit + * too much, let it overcommit up to 1/8 of the space. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL) + avail >>= 3; + else + avail >>= 1; + + if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; +} + +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = btrfs_space_info_used(space_info, true); + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count. 
+ */ + if (check_overcommit && + !can_overcommit(fs_info, space_info, 0, flush, false)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + ticket->bytes); + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + btrfs_space_info_update_bytes_may_use(fs_info, + space_info, + num_bytes); + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + +#define DUMP_BLOCK_RSV(fs_info, rsv_name) \ +do { \ + struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \ + spin_lock(&__rsv->lock); \ + btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \ + __rsv->size, __rsv->reserved); \ + spin_unlock(&__rsv->lock); \ +} while (0) + +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) +{ + struct btrfs_block_group_cache *cache; + int index = 0; + + spin_lock(&info->lock); + btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + info->flags, + info->total_bytes - btrfs_space_info_used(info, true), + info->full ? "" : "not "); + btrfs_info(fs_info, + "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu", + info->total_bytes, info->bytes_used, info->bytes_pinned, + info->bytes_reserved, info->bytes_may_use, + info->bytes_readonly); + spin_unlock(&info->lock); + + DUMP_BLOCK_RSV(fs_info, global_block_rsv); + DUMP_BLOCK_RSV(fs_info, trans_block_rsv); + DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); + DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); + + if (!dump_block_groups) + return; + + down_read(&info->groups_sem); +again: + list_for_each_entry(cache, &info->block_groups[index], list) { + spin_lock(&cache->lock); + btrfs_info(fs_info, + "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s", + cache->key.objectid, cache->key.offset, + btrfs_block_group_used(&cache->item), cache->pinned, + cache->reserved, cache->ro ? 
"[readonly]" : ""); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; + up_read(&info->groups_sem); +} + +static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, + unsigned long nr_pages, int nr_items) +{ + struct super_block *sb = fs_info->sb; + + if (down_read_trylock(&sb->s_umount)) { + writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); + up_read(&sb->s_umount); + } else { + /* + * We needn't worry the filesystem going from r/w to r/o though + * we don't acquire ->s_umount mutex, because the filesystem + * should guarantee the delalloc inodes list be empty after + * the filesystem is readonly(all dirty pages are written to + * the disk). + */ + btrfs_start_delalloc_roots(fs_info, nr_items); + if (!current->journal_info) + btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); + } +} + +static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, + u64 to_reclaim) +{ + u64 bytes; + u64 nr; + + bytes = btrfs_calc_trans_metadata_size(fs_info, 1); + nr = div64_u64(to_reclaim, bytes); + if (!nr) + nr = 1; + return nr; +} + +#define EXTENT_SIZE_PER_ITEM SZ_256K + +/* + * shrink metadata reservation for delalloc + */ +static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, + u64 orig, bool wait_ordered) +{ + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; + u64 delalloc_bytes; + u64 dio_bytes; + u64 async_pages; + u64 items; + long time_left; + unsigned long nr_pages; + int loops; + + /* Calc the number of the pages we need flush for space reservation */ + items = calc_reclaim_items_nr(fs_info, to_reclaim); + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + + trans = (struct btrfs_trans_handle *)current->journal_info; + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + if (delalloc_bytes == 0 && dio_bytes == 0) { + if (trans) + return; + if (wait_ordered) + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + return; + } + + /* + * If we are doing more ordered than delalloc we need to just wait on + * ordered extents, otherwise we'll waste time trying to flush delalloc + * that likely won't give us the space back we need. + */ + if (dio_bytes > delalloc_bytes) + wait_ordered = true; + + loops = 0; + while ((delalloc_bytes || dio_bytes) && loops < 3) { + nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + + /* + * Triggers inode writeback for up to nr_pages. This will invoke + * ->writepages callback and trigger delalloc filling + * (btrfs_run_delalloc_range()). + */ + btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); + + /* + * We need to wait for the compressed pages to start before + * we continue. + */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * Calculate how many compressed pages we want to be written + * before we continue. I.e if there are more async pages than we + * require wait_event will wait until nr_pages are written. 
+ */ + if (async_pages <= nr_pages) + async_pages = 0; + else + async_pages -= nr_pages; + + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + (int)async_pages); +skip_async: + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + + loops++; + if (wait_ordered && !trans) { + btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); + } else { + time_left = schedule_timeout_killable(1); + if (time_left) + break; + } + delalloc_bytes = percpu_counter_sum_positive( + &fs_info->delalloc_bytes); + dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); + } +} + +/** + * maybe_commit_transaction - possibly commit the transaction if its ok to + * @root - the root we're allocating for + * @bytes - the number of bytes we want to reserve + * @force - force the commit + * + * This will check to make sure that committing the transaction will actually + * get us somewhere and then commit the transaction if it does. Otherwise it + * will return -ENOSPC. + */ +static int may_commit_transaction(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + struct reserve_ticket *ticket = NULL; + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_trans_handle *trans; + u64 bytes_needed; + u64 reclaim_bytes = 0; + + trans = (struct btrfs_trans_handle *)current->journal_info; + if (trans) + return -EAGAIN; + + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes_needed = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes_needed) + return 0; + + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * See if there is enough pinned space to make this reservation, or if + * we have block groups that are going to be freed, allowing us to + * possibly do a chunk allocation the next loop through. + */ + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + __percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) + goto commit; + + /* + * See if there is some space in the delayed insertion reservation for + * this reservation. + */ + if (space_info != delayed_rsv->space_info) + goto enospc; + + spin_lock(&delayed_rsv->lock); + reclaim_bytes += delayed_rsv->reserved; + spin_unlock(&delayed_rsv->lock); + + spin_lock(&delayed_refs_rsv->lock); + reclaim_bytes += delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + if (reclaim_bytes >= bytes_needed) + goto commit; + bytes_needed -= reclaim_bytes; + + if (__percpu_counter_compare(&space_info->total_bytes_pinned, + bytes_needed, + BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) + goto enospc; + +commit: + return btrfs_commit_transaction(trans); +enospc: + btrfs_end_transaction(trans); + return -ENOSPC; +} + +/* + * Try to flush some data based on policy set by @state. This is only advisory + * and may fail for various reasons. The caller is supposed to examine the + * state of @space_info to detect the outcome. 
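flush_space(), which follows, is the single entry point of a small ordered state machine: each state tries to free space by a different, increasingly expensive means, and its callers simply walk the states until the reservation fits. A compact sketch of the pattern (the states and the fake try_flush() handler are stand-ins, not the kernel's enum or helpers):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flush ladder, loosely mirroring the btrfs flush states. */
enum flush_state {
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	FLUSH_DELAYED_REFS,
	ALLOC_CHUNK,
	COMMIT_TRANS,
	NR_FLUSH_STATES,
};

static bool try_flush(enum flush_state state, long *free_bytes)
{
	/* Pretend each step frees a bit more space than the previous one. */
	*free_bytes += (state + 1) * 1024;
	return true;	/* advisory: success does not guarantee enough space */
}

int main(void)
{
	long free_bytes = 0, needed = 8 * 1024;

	for (int s = FLUSH_DELAYED_ITEMS; s < NR_FLUSH_STATES; s++) {
		try_flush(s, &free_bytes);
		if (free_bytes >= needed)
			break;
	}
	printf("freed %ld of %ld bytes\n", free_bytes, needed);
	return 0;
}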
+ */ +static void flush_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 num_bytes, + int state) +{ + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int nr; + int ret = 0; + + switch (state) { + case FLUSH_DELAYED_ITEMS_NR: + case FLUSH_DELAYED_ITEMS: + if (state == FLUSH_DELAYED_ITEMS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2; + else + nr = -1; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_run_delayed_items_nr(trans, nr); + btrfs_end_transaction(trans); + break; + case FLUSH_DELALLOC: + case FLUSH_DELALLOC_WAIT: + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + state == FLUSH_DELALLOC_WAIT); + break; + case FLUSH_DELAYED_REFS_NR: + case FLUSH_DELAYED_REFS: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + if (state == FLUSH_DELAYED_REFS_NR) + nr = calc_reclaim_items_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); + btrfs_end_transaction(trans); + break; + case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + ret = btrfs_chunk_alloc(trans, + btrfs_metadata_alloc_profile(fs_info), + (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); + if (ret > 0 || ret == -ENOSPC) + ret = 0; + break; + case COMMIT_TRANS: + /* + * If we have pending delayed iputs then we could free up a + * bunch of pinned space, so make sure we run the iputs before + * we do our pinned bytes check below. + */ + btrfs_run_delayed_iputs(fs_info); + btrfs_wait_on_delayed_iputs(fs_info); + + ret = may_commit_transaction(fs_info, space_info); + break; + default: + ret = -ENOSPC; + break; + } + + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret); + return; +} + +static inline u64 +btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool system_chunk) +{ + struct reserve_ticket *ticket; + u64 used; + u64 expected; + u64 to_reclaim = 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; + + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); + if (can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + return 0; + + used = btrfs_space_info_used(space_info, true); + + if (can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + expected = div_factor_fine(space_info->total_bytes, 95); + else + expected = div_factor_fine(space_info->total_bytes, 90); + + if (used > expected) + to_reclaim = used - expected; + else + to_reclaim = 0; + to_reclaim = min(to_reclaim, space_info->bytes_may_use + + space_info->bytes_reserved); + return to_reclaim; +} + +static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 used, bool system_chunk) +{ + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. 
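The "plain full" check below hinges on div_factor_fine(), a percentage helper that in effect computes num * factor / 100, so the threshold is 98% of the space. A tiny model of the decision with made-up numbers (the kernel version additionally bails out while the filesystem is closing or remounting):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Rough equivalent of div_factor_fine(num, factor), factor being a percentage. */
static uint64_t div_factor_fine_model(uint64_t num, int factor)
{
	return num * factor / 100;
}

/* Simplified "is background reclaim worth kicking?" check. */
static bool need_async_reclaim_model(uint64_t total, uint64_t used_plus_reserved,
				     uint64_t used_with_may_use)
{
	uint64_t thresh = div_factor_fine_model(total, 98);

	if (used_plus_reserved >= thresh)	/* plain full: reclaim won't help */
		return false;
	return used_with_may_use >= thresh;
}

int main(void)
{
	uint64_t total = 8ULL << 30;

	printf("%d\n", need_async_reclaim_model(total, 6ULL << 30,
						total - (1ULL << 20)));
	return 0;
}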
*/ + if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) + return 0; + + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, + system_chunk)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); +} + +static bool wake_all_tickets(struct list_head *head) +{ + struct reserve_ticket *ticket; + + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); + if (ticket->bytes != ticket->orig_bytes) + return true; + } + return false; +} + +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 to_reclaim; + int flush_state; + int commit_cycles = 0; + u64 last_tickets_id; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + flush_state = FLUSH_DELAYED_ITEMS_NR; + do { + flush_space(fs_info, space_info, to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, + space_info, + false); + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + /* + * We don't want to force a chunk allocation until we've tried + * pretty hard to reclaim space. Think of the case where we + * freed up a bunch of space and so have a lot of pinned space + * to reclaim. We would rather use that than possibly create a + * underutilized metadata chunk. So if this is our first run + * through the flushing state machine skip ALLOC_CHUNK_FORCE and + * commit the transaction. If nothing has changed the next go + * around then we can force a chunk allocation. 
+ */ + if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles) + flush_state++; + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + if (wake_all_tickets(&space_info->tickets)) { + flush_state = FLUSH_DELAYED_ITEMS_NR; + commit_cycles--; + } else { + space_info->flush = 0; + } + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static const enum btrfs_flush_state priority_flush_states[] = { + FLUSH_DELAYED_ITEMS_NR, + FLUSH_DELAYED_ITEMS, + ALLOC_CHUNK, +}; + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, + false); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + flush_state = 0; + do { + flush_space(fs_info, space_info, to_reclaim, + priority_flush_states[flush_state]); + flush_state++; + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } while (flush_state < ARRAY_SIZE(priority_flush_states)); +} + +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) + +{ + DEFINE_WAIT(wait); + u64 reclaim_bytes = 0; + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < ticket->orig_bytes) + reclaim_bytes = ticket->orig_bytes - ticket->bytes; + spin_unlock(&space_info->lock); + + if (reclaim_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); + return ret; +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @space_info - the space info we want to allocate from + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. 
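Putting the pieces together, the reservation routine that follows does three things under the space_info lock: grant immediately if the bytes fit (or overcommit allows them), otherwise queue a ticket, handing FLUSH_ALL tickets to the async worker and flushing the rest from the caller's context. A condensed, lock-free sketch of that control flow (names and return values are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum flush_mode { NO_FLUSH, FLUSH_LIMIT, FLUSH_ALL };

/* Shape of the reservation path: grant, fail, or queue a ticket and wait. */
static int reserve_model(uint64_t *bytes_may_use, uint64_t used, uint64_t total,
			 uint64_t want, enum flush_mode flush, bool overcommit_ok)
{
	if (used + want <= total || overcommit_ok) {
		*bytes_may_use += want;		/* fast path: reservation granted */
		return 0;
	}
	if (flush == NO_FLUSH)
		return -1;			/* -ENOSPC, caller gets nothing */

	/*
	 * The real code queues a reserve_ticket here: FLUSH_ALL tickets go to
	 * the async worker, others are flushed synchronously by the caller.
	 */
	printf("queueing ticket for %llu bytes (%s)\n",
	       (unsigned long long)want,
	       flush == FLUSH_ALL ? "async flusher" : "priority flush");
	return 1;				/* would block on the ticket */
}

int main(void)
{
	uint64_t may_use = 0;

	reserve_model(&may_use, 7ULL << 30, 8ULL << 30, 2ULL << 30,
		      FLUSH_ALL, false);
	return 0;
}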
+ */ +static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush, + bool system_chunk) +{ + struct reserve_ticket ticket; + u64 used; + u64 reclaim_bytes = 0; + int ret = 0; + + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + + spin_lock(&space_info->lock); + ret = -ENOSPC; + used = btrfs_space_info_used(space_info, true); + + /* + * Carry on if we have enough space (short-circuit) OR call + * can_overcommit() to ensure we can overcommit to continue. + */ + if ((used + orig_bytes <= space_info->total_bytes) || + can_overcommit(fs_info, space_info, orig_bytes, flush, + system_chunk)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, orig_bytes, 1); + ret = 0; + } + + /* + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. + * + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. + */ + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + ticket.orig_bytes = orig_bytes; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } + } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + used += orig_bytes; + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && + need_do_async_reclaim(fs_info, space_info, + used, system_chunk) && + !work_busy(&fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(fs_info, space_info->flags, + orig_bytes, flush, "preempt"); + queue_work(system_unbound_wq, + &fs_info->async_reclaim_work); + } + } + spin_unlock(&space_info->lock); + if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + return ret; + + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(fs_info, space_info, &ticket); + + ret = 0; + priority_reclaim_metadata_space(fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) + reclaim_bytes = orig_bytes - ticket.bytes; + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + + if (reclaim_bytes) + btrfs_space_info_add_old_bytes(fs_info, space_info, + reclaim_bytes); + ASSERT(list_empty(&ticket.list)); + return ret; +} + +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orig_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. 
It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + int ret; + bool system_chunk = (root == fs_info->chunk_root); + + ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, + orig_bytes, flush, system_chunk); + if (ret == -ENOSPC && + unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { + if (block_rsv != global_rsv && + !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes)) + ret = 0; + } + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + block_rsv->space_info->flags, + orig_bytes, 1); + + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, block_rsv->space_info, + orig_bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h new file mode 100644 index 000000000000..c2b54b8e1a14 --- /dev/null +++ b/fs/btrfs/space-info.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SPACE_INFO_H +#define BTRFS_SPACE_INFO_H + +struct btrfs_space_info { + spinlock_t lock; + + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ + u64 bytes_used; /* total bytes used, + this doesn't take mirrors into account */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + + unsigned int full:1; /* indicates that we cannot allocate any more + chunks for this space */ + unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ + + unsigned int flush:1; /* set if we are trying to make space */ + + unsigned int force_alloc; /* set if we need to force a chunk + alloc for this space */ + + u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ + + u64 flags; + + /* + * bytes_pinned is kept in line with what is actually pinned, as in + * we've called update_block_group and dropped the bytes_used counter + * and increased the bytes_pinned counter. However this means that + * bytes_pinned does not reflect the bytes that will be pinned once the + * delayed refs are flushed, so this counter is inc'ed every time we + * call btrfs_free_extent so it is a realtime count of what will be + * freed once the transaction is committed. It will be zeroed every + * time the transaction commits. + */ + struct percpu_counter total_bytes_pinned; + + struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; + /* + * tickets_id just indicates the next ticket will be handled, so note + * it's not stored per ticket. 
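The counters in this header are updated through the DECLARE_SPACE_INFO_UPDATE() macro declared a little further down: deltas are signed, and a decrement that would underflow is clamped to zero with a warning rather than wrapping. A userspace imitation of that pattern (a plain function instead of a macro, with no tracing or lockdep):

#include <stdint.h>
#include <stdio.h>

/* Imitation of the underflow-clamping update helpers in space-info.h. */
struct sinfo_model { uint64_t bytes_may_use; };

static void update_bytes_may_use(struct sinfo_model *s, int64_t bytes)
{
	if (bytes < 0 && s->bytes_may_use < (uint64_t)-bytes) {
		fprintf(stderr, "warning: bytes_may_use underflow, clamping\n");
		s->bytes_may_use = 0;
		return;
	}
	s->bytes_may_use += bytes;
}

int main(void)
{
	struct sinfo_model s = { .bytes_may_use = 4096 };

	update_bytes_may_use(&s, -8192);	/* would underflow: clamps to 0 */
	printf("bytes_may_use=%llu\n", (unsigned long long)s.bytes_may_use);
	return 0;
}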
+ */ + u64 tickets_id; + + struct rw_semaphore groups_sem; + /* for block groups in our same type */ + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; + wait_queue_head_t wait; + + struct kobject kobj; + struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES]; +}; + +struct reserve_ticket { + u64 orig_bytes; + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + +/* + * + * Declare a helper function to detect underflow of various space info members + */ +#define DECLARE_SPACE_INFO_UPDATE(name) \ +static inline void \ +btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \ + struct btrfs_space_info *sinfo, \ + s64 bytes) \ +{ \ + lockdep_assert_held(&sinfo->lock); \ + trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \ + if (bytes < 0 && sinfo->name < -bytes) { \ + WARN_ON(1); \ + sinfo->name = 0; \ + return; \ + } \ + sinfo->name += bytes; \ +} + +DECLARE_SPACE_INFO_UPDATE(bytes_may_use); +DECLARE_SPACE_INFO_UPDATE(bytes_pinned); + +void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); +void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, + struct btrfs_space_info **space_info); +struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags); +u64 btrfs_space_info_used(struct btrfs_space_info *s_info, + bool may_use_included); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); +void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); +int btrfs_reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); + +#endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0645ec428b4f..78de9d5d80c6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -42,6 +42,7 @@ #include "dev-replace.h" #include "free-space-cache.h" #include "backref.h" +#include "space-info.h" #include "tests/btrfs-tests.h" #include "qgroup.h" @@ -1553,6 +1554,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; + if (!strstr(crc32c_impl(), "generic")) + set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); error = btrfs_fill_super(s, fs_devices, data); } if (!error) @@ -1601,14 +1604,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct vfsmount *mnt_root; struct dentry *root; - fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - error = btrfs_parse_subvol_options(data, &subvol_name, &subvol_objectid); if (error) { @@ -1904,8 +1903,9 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, u64 type; u64 avail_space; u64 min_stripe_size; - int min_stripes = 1, num_stripes = 1; + int min_stripes, num_stripes = 1; int i = 0, nr_devices; + const struct btrfs_raid_attr *rattr; /* * We 
aren't under the device list lock, so this is racy-ish, but good @@ -1929,21 +1929,18 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, /* calc min stripe number for data space allocation */ type = btrfs_data_alloc_profile(fs_info); - if (type & BTRFS_BLOCK_GROUP_RAID0) { - min_stripes = 2; + rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)]; + min_stripes = rattr->devs_min; + + if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - } else if (type & BTRFS_BLOCK_GROUP_RAID1) { - min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) num_stripes = 2; - } else if (type & BTRFS_BLOCK_GROUP_RAID10) { - min_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; - } - if (type & BTRFS_BLOCK_GROUP_DUP) - min_stripe_size = 2 * BTRFS_STRIPE_LEN; - else - min_stripe_size = BTRFS_STRIPE_LEN; + /* Adjust for more than 1 stripe per device */ + min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN; rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { @@ -2466,3 +2463,4 @@ late_initcall(init_btrfs_fs); module_exit(exit_btrfs_fs) MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c1dfc97893ba..9539f8143b7a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -16,6 +16,7 @@ #include "transaction.h" #include "sysfs.h" #include "volumes.h" +#include "space-info.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7bf4d5734dbe..1bf6b5a79191 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -10,6 +10,7 @@ #include "btrfs-tests.h" #include "../ctree.h" #include "../extent_io.h" +#include "../btrfs_inode.h" #define PROCESS_UNLOCK (1 << 0) #define PROCESS_RELEASE (1 << 1) @@ -58,7 +59,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, static int test_find_delalloc(u32 sectorsize) { struct inode *inode; - struct extent_io_tree tmp; + struct extent_io_tree *tmp; struct page *page; struct page *locked_page = NULL; unsigned long index = 0; @@ -76,12 +77,13 @@ static int test_find_delalloc(u32 sectorsize) test_std_err(TEST_ALLOC_INODE); return -ENOMEM; } + tmp = &BTRFS_I(inode)->io_tree; /* * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, &tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); /* * First go through and create and mark all of our pages dirty, we pin @@ -108,10 +110,10 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); + set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("should have found at least one delalloc"); @@ -122,7 +124,7 @@ static int test_find_delalloc(u32 sectorsize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); unlock_page(locked_page); put_page(locked_page); @@ -139,10 +141,10 @@ static int test_find_delalloc(u32 sectorsize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes 
- 1, 0, NULL); + set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("couldn't find delalloc in our range"); @@ -158,7 +160,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* locked_page was unlocked above */ put_page(locked_page); @@ -176,7 +178,7 @@ static int test_find_delalloc(u32 sectorsize) } start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); @@ -194,10 +196,10 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); + set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -213,7 +215,7 @@ static int test_find_delalloc(u32 sectorsize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(&tmp, start, end); + unlock_extent(tmp, start, end); /* * Now to test where we run into a page that is no longer dirty in the @@ -238,7 +240,7 @@ static int test_find_delalloc(u32 sectorsize) * this changes at any point in the future we will need to fix this * tests expected behavior. 
*/ - found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, + found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { test_err("didn't find our range"); @@ -256,7 +258,7 @@ static int test_find_delalloc(u32 sectorsize) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); + clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -432,6 +434,89 @@ out: return ret; } +static int test_find_first_clear_extent_bit(void) +{ + struct extent_io_tree tree; + u64 start, end; + + test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + + /* + * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between + * 4M-32M + */ + set_extent_bits(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != 0 || end != SZ_1M -1) + test_err("error finding beginning range: start %llu end %llu", + start, end); + + /* Now add 32M-64M so that we have a hole between 4M-32M */ + set_extent_bits(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + /* + * Request first hole starting at 12M, we should get 4M-32M + */ + find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M - 1) + test_err("error finding trimmed range: start %llu end %llu", + start, end); + + /* + * Search in the middle of allocated range, should get the next one + * available, which happens to be unallocated -> 4M-32M + */ + find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); + + if (start != SZ_4M || end != SZ_32M -1) + test_err("error finding next unalloc range: start %llu end %llu", + start, end); + + /* + * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag + * being unset in this range, we should get the entry in range 64M-72M + */ + set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED); + find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); + + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + test_err("error finding exact range: start %llu end %llu", + start, end); + + find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); + + /* + * Search in the middle of set range whose immediate neighbour doesn't + * have the bits set so it must be returned + */ + if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) + test_err("error finding next alloc range: start %llu end %llu", + start, end); + + /* + * Search beyond any known range, shall return after last known range + * and end should be -1 + */ + find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + if (start != SZ_64M + SZ_8M || end != -1) + test_err( + "error handling beyond end of range search: start %llu end %llu", + start, end); + + return 0; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -442,6 +527,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) if (ret) goto out; + ret = test_find_first_clear_extent_bit(); + if (ret) + goto out; + ret = test_eb_bitmaps(sectorsize, nodesize); out: return ret; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 87aeabe9d610..4a7f796c9900 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -66,7 +66,9 @@ static int 
test_case_1(struct btrfs_fs_info *fs_info, em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); goto out; @@ -85,7 +87,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); goto out; @@ -104,7 +108,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->len = len; em->block_start = start; em->block_len = len; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); if (ret) { test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; @@ -148,7 +154,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); goto out; @@ -167,7 +175,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); goto out; @@ -186,7 +196,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->len = SZ_1K; em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); + write_unlock(&em_tree->lock); if (ret) { test_err("case2 [0 1K]: ret %d", ret); goto out; @@ -225,7 +237,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->len = SZ_4K; em->block_start = SZ_4K; em->block_len = SZ_4K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); goto out; @@ -244,7 +258,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_16K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); if (ret) { test_err("case3 [0x%llx 0x%llx): ret %d", start, start + len, ret); @@ -320,7 +336,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = SZ_8K; em->block_start = 0; em->block_len = SZ_8K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); goto out; @@ -339,7 +357,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = 24 * SZ_1K; em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); goto out; @@ -357,7 +377,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->len = SZ_32K; em->block_start = 0; em->block_len = SZ_32K; + write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + write_unlock(&em_tree->lock); 
if (ret) { test_err("case4 [0x%llx 0x%llx): ret %d", start, len, ret); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3f6811cdf803..3b8ae1a8f02d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -129,6 +129,24 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) } /* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + +/* * either allocate a new transaction or hop into the existing one */ static noinline int join_transaction(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 78c446c222b7..527ea94b57d9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -224,5 +224,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction); void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); #endif diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 96fce4bef4e7..ccd5706199d7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -132,6 +132,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, struct btrfs_file_extent_item *fi; u32 sectorsize = fs_info->sectorsize; u32 item_size = btrfs_item_size_nr(leaf, slot); + u64 extent_end; if (!IS_ALIGNED(key->offset, sectorsize)) { file_extent_err(leaf, slot, @@ -207,6 +208,16 @@ static int check_extent_data_item(struct extent_buffer *leaf, CHECK_FE_ALIGNED(leaf, slot, fi, num_bytes, sectorsize)) return -EUCLEAN; + /* Catch extent end overflow */ + if (check_add_overflow(btrfs_file_extent_num_bytes(leaf, fi), + key->offset, &extent_end)) { + file_extent_err(leaf, slot, + "extent end overflow, have file offset %llu extent num bytes %llu", + key->offset, + btrfs_file_extent_num_bytes(leaf, fi)); + return -EUCLEAN; + } + /* * Check that no two consecutive file extent items, in the same leaf, * present ranges that overlap each other. diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3fc8d854d7fb..6c8297bcfeb7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3323,6 +3323,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* + * Check if an inode was logged in the current transaction. We can't always rely + * on an inode's logged_trans value, because it's an in-memory only field and + * therefore not persisted. This means that its value is lost if the inode gets + * evicted and loaded again from disk (in which case it has a value of 0, and + * certainly it is smaller then any possible transaction ID), when that happens + * the full_sync flag is set in the inode's runtime flags, so on that case we + * assume eviction happened and ignore the logged_trans value, assuming the + * worst case, that the inode was logged before in the current transaction. 
+ */ +static bool inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) +{ + if (inode->logged_trans == trans->transid) + return true; + + if (inode->last_trans == trans->transid && + test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) + return true; + + return false; +} + +/* * If both a file and directory are logged, and unlinks or renames are * mixed in, we have a few interesting corners: * @@ -3356,7 +3380,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, int bytes_del = 0; u64 dir_ino = btrfs_ino(dir); - if (dir->logged_trans < trans->transid) + if (!inode_logged(trans, dir)) return 0; ret = join_running_log_trans(root); @@ -3460,7 +3484,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, u64 index; int ret; - if (inode->logged_trans < trans->transid) + if (!inode_logged(trans, inode)) return 0; ret = join_running_log_trans(root); @@ -5420,9 +5444,19 @@ log_extents: } } + /* + * Don't update last_log_commit if we logged that an inode exists after + * it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, then + * the inode gets evicted after all delalloc was flushed, then we log + * it exists (due to a rename for example) and then fsync it. This last + * fsync would do nothing (not logging the extents previously written). + */ spin_lock(&inode->lock); inode->logged_trans = trans->transid; - inode->last_log_commit = inode->last_sub_trans; + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1c2a6e4b39da..a13ddba1ebc3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -28,6 +28,7 @@ #include "dev-replace.h" #include "sysfs.h" #include "tree-checker.h" +#include "space-info.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -123,12 +124,14 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const char *get_raid_name(enum btrfs_raid_types type) +const char *btrfs_bg_type_to_raid_name(u64 flags) { - if (type >= BTRFS_NR_RAID_TYPES) + const int index = btrfs_bg_flags_to_raid_index(flags); + + if (index >= BTRFS_NR_RAID_TYPES) return NULL; - return btrfs_raid_array[type].raid_name; + return btrfs_raid_array[index].raid_name; } /* @@ -237,7 +240,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * chunk_mutex * ----------- * protects chunks, adding or removing during allocation, trim or when a new - * device is added/removed + * device is added/removed. Additionally it also protects post_commit_list of + * individual devices, since they can be added to the transaction's + * post_commit_list only with chunk_mutex held. 
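Several of the volumes.c hunks further down replace open-coded profile checks with lookups into btrfs_raid_array; the clearest example is the new calc_data_stripes(), where the number of data stripes falls out of the table's ncopies and nparity fields. A small standalone version of that computation, with an assumed table covering a few profiles:

#include <stdio.h>

/* Assumed subset of the raid attribute table, for illustration only. */
struct raid_attr_model { const char *name; int ncopies; int nparity; };

static const struct raid_attr_model raid_table[] = {
	{ "raid0",  1, 0 },
	{ "raid1",  2, 0 },
	{ "raid10", 2, 0 },
	{ "raid5",  1, 1 },
	{ "raid6",  1, 2 },
};

/* Data stripes in a chunk: drop the parity stripes, or divide by the copy count. */
static int calc_data_stripes_model(const struct raid_attr_model *attr,
				   int num_stripes)
{
	if (attr->nparity)
		return num_stripes - attr->nparity;
	return num_stripes / attr->ncopies;
}

int main(void)
{
	for (unsigned i = 0; i < sizeof(raid_table) / sizeof(raid_table[0]); i++)
		printf("%s with 6 stripes -> %d data stripes\n",
		       raid_table[i].name,
		       calc_data_stripes_model(&raid_table[i], 6));
	return 0;
}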
* * cleaner_mutex * ------------- @@ -1818,7 +1823,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info) struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); n = rb_last(&em_tree->map.rb_root); if (n) { @@ -2941,7 +2946,7 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree; struct extent_map *em; - em_tree = &fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, length); read_unlock(&em_tree->lock); @@ -3474,6 +3479,18 @@ static int chunk_devid_filter(struct extent_buffer *leaf, return 1; } +static u64 calc_data_stripes(u64 type, int num_stripes) +{ + const int index = btrfs_bg_flags_to_raid_index(type); + const int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; + + if (nparity) + return num_stripes - nparity; + else + return num_stripes / ncopies; +} + /* [pstart, pend) */ static int chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, @@ -3483,22 +3500,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); u64 stripe_offset; u64 stripe_length; + u64 type; int factor; int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) return 0; - if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { - factor = num_stripes / 2; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { - factor = num_stripes - 1; - } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { - factor = num_stripes - 2; - } else { - factor = num_stripes; - } + type = btrfs_chunk_type(leaf, chunk); + factor = calc_data_stripes(type, num_stripes); for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); @@ -3921,11 +3931,9 @@ static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf, bp += ret; \ } while (0) - if (flags & BTRFS_BALANCE_ARGS_CONVERT) { - int index = btrfs_bg_flags_to_raid_index(bargs->target); - - CHECK_APPEND_1ARG("convert=%s,", get_raid_name(index)); - } + if (flags & BTRFS_BALANCE_ARGS_CONVERT) + CHECK_APPEND_1ARG("convert=%s,", + btrfs_bg_type_to_raid_name(bargs->target)); if (flags & BTRFS_BALANCE_ARGS_SOFT) CHECK_APPEND_NOARG("soft,"); @@ -4047,6 +4055,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, u64 num_devices; unsigned seq; bool reducing_integrity; + int i; if (btrfs_fs_closing(fs_info) || atomic_read(&fs_info->balance_pause_req) || @@ -4076,48 +4085,43 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) + if (num_devices >= btrfs_raid_array[i].devs_min) + allowed |= btrfs_raid_array[i].bg_flag; - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; - if (num_devices > 1) - allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); - if (num_devices > 2) - allowed |= BTRFS_BLOCK_GROUP_RAID5; - if (num_devices > 3) - allowed |= (BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID6); if (validate_convert_profile(&bctl->data, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->data.target); - btrfs_err(fs_info, "balance: invalid convert data profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->data.target)); ret = -EINVAL; goto out; } if 
(validate_convert_profile(&bctl->meta, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->meta.target); - btrfs_err(fs_info, "balance: invalid convert metadata profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->meta.target)); ret = -EINVAL; goto out; } if (validate_convert_profile(&bctl->sys, allowed)) { - int index = btrfs_bg_flags_to_raid_index(bctl->sys.target); - btrfs_err(fs_info, "balance: invalid convert system profile %s", - get_raid_name(index)); + btrfs_bg_type_to_raid_name(bctl->sys.target)); ret = -EINVAL; goto out; } - /* allow to reduce meta or sys integrity only if force set */ - allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6; + /* + * Allow to reduce metadata or system integrity only if force set for + * profiles with redundancy (copies, parity) + */ + allowed = 0; + for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) { + if (btrfs_raid_array[i].ncopies >= 2 || + btrfs_raid_array[i].tolerated_failures >= 1) + allowed |= btrfs_raid_array[i].bg_flag; + } do { seq = read_seqbegin(&fs_info->profiles_lock); @@ -4152,12 +4156,18 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { - int meta_index = btrfs_bg_flags_to_raid_index(meta_target); - int data_index = btrfs_bg_flags_to_raid_index(data_target); - btrfs_warn(fs_info, "balance: metadata profile %s has lower redundancy than data profile %s", - get_raid_name(meta_index), get_raid_name(data_index)); + btrfs_bg_type_to_raid_name(meta_target), + btrfs_bg_type_to_raid_name(data_target)); + } + + if (fs_info->send_in_progress) { + btrfs_warn_rl(fs_info, +"cannot run balance while send operations are in progress (%d in progress)", + fs_info->send_in_progress); + ret = -EAGAIN; + goto out; } ret = insert_balance_item(fs_info, bctl); @@ -4949,6 +4959,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, sub_stripes = btrfs_raid_array[index].sub_stripes; dev_stripes = btrfs_raid_array[index].dev_stripes; devs_max = btrfs_raid_array[index].devs_max; + if (!devs_max) + devs_max = BTRFS_MAX_DEVS(info); devs_min = btrfs_raid_array[index].devs_min; devs_increment = btrfs_raid_array[index].devs_increment; ncopies = btrfs_raid_array[index].ncopies; @@ -4957,8 +4969,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_DATA) { max_stripe_size = SZ_1G; max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) @@ -4966,13 +4976,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, else max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS(info); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; - if (!devs_max) - devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; } else { btrfs_err(info, "invalid chunk type 0x%llx requested", type); @@ -5143,7 +5149,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->block_len = em->len; em->orig_block_len = stripe_size; - em_tree = &info->mapping_tree.map_tree; + em_tree = &info->mapping_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); if (ret) { @@ 
-5324,20 +5330,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) static inline int btrfs_chunk_max_errors(struct map_lookup *map) { - int max_errors; + const int index = btrfs_bg_flags_to_raid_index(map->type); - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } else { - max_errors = 0; - } - - return max_errors; + return btrfs_raid_array[index].tolerated_failures; } int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) @@ -5378,21 +5373,16 @@ end: return readonly; } -void btrfs_mapping_init(struct btrfs_mapping_tree *tree) -{ - extent_map_tree_init(&tree->map_tree); -} - -void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +void btrfs_mapping_tree_free(struct extent_map_tree *tree) { struct extent_map *em; while (1) { - write_lock(&tree->map_tree.lock); - em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, 0, (u64)-1); if (em) - remove_extent_mapping(&tree->map_tree, em); - write_unlock(&tree->map_tree.lock); + remove_extent_mapping(tree, em); + write_unlock(&tree->lock); if (!em) break; /* once for us */ @@ -5419,7 +5409,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; @@ -5493,7 +5483,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev; ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))); + (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -5682,7 +5672,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &remaining_stripes); div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); last_stripe *= sub_stripes; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { num_stripes = map->num_stripes; } else { @@ -5926,6 +5916,102 @@ static bool need_full_stripe(enum btrfs_map_op op) return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); } +/* + * btrfs_get_io_geometry - calculates the geomery of a particular (address, len) + * tuple. This information is used to calculate how big a + * particular bio can get before it straddles a stripe. + * + * @fs_info - the filesystem + * @logical - address that we want to figure out the geometry of + * @len - the length of IO we are going to perform, starting at @logical + * @op - type of operation - write or read + * @io_geom - pointer used to return values + * + * Returns < 0 in case a chunk for the given logical address cannot be found, + * usually shouldn't happen unless @logical is corrupted, 0 otherwise. 
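The geometry helper that follows is pure arithmetic on the chunk layout: which stripe a logical address lands in, how far into that stripe it sits, and, for RAID5/6, where the enclosing full data stripe starts. The same math in a self-contained form, with example numbers chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t chunk_start = 1ULL << 30;		/* example chunk at 1G */
	uint64_t stripe_len = 64 * 1024ULL;		/* 64K stripes */
	uint64_t data_stripes = 3;			/* e.g. 4-device RAID5 */
	uint64_t logical = chunk_start + 200 * 1024ULL;	/* address being mapped */

	uint64_t offset = logical - chunk_start;	/* offset inside the chunk */
	uint64_t stripe_nr = offset / stripe_len;	/* which stripe we hit */
	uint64_t stripe_offset = offset - stripe_nr * stripe_len;

	/* RAID56 only: round down to the start of the full (data) stripe. */
	uint64_t full_stripe_len = stripe_len * data_stripes;
	uint64_t full_stripe_start = (offset / full_stripe_len) * full_stripe_len;

	printf("offset=%lluK stripe_nr=%llu stripe_offset=%lluK full_stripe_start=%lluK\n",
	       (unsigned long long)(offset / 1024),
	       (unsigned long long)stripe_nr,
	       (unsigned long long)(stripe_offset / 1024),
	       (unsigned long long)(full_stripe_start / 1024));
	return 0;
}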
+ */ +int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 len, struct btrfs_io_geometry *io_geom) +{ + struct extent_map *em; + struct map_lookup *map; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + u64 stripe_len; + u64 raid56_full_stripe_start = (u64)-1; + int data_stripes; + + ASSERT(op != BTRFS_MAP_DISCARD); + + em = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* Offset of this logical address in the chunk */ + offset = logical - em->start; + /* Len of a stripe in a chunk */ + stripe_len = map->stripe_len; + /* Stripe wher this block falls in */ + stripe_nr = div64_u64(offset, stripe_len); + /* Offset of stripe in the chunk */ + stripe_offset = stripe_nr * stripe_len; + if (offset < stripe_offset) { + btrfs_crit(fs_info, +"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", + stripe_offset, offset, em->start, logical, stripe_len); + free_extent_map(em); + return -EINVAL; + } + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_offset; + data_stripes = nr_data_stripes(map); + + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + u64 max_len = stripe_len - stripe_offset; + + /* + * In case of raid56, we need to know the stripe aligned start + */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + unsigned long full_stripe_len = stripe_len * data_stripes; + raid56_full_stripe_start = offset; + + /* + * Allow a write of a full stripe, but make sure we + * don't allow straddling of stripes + */ + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); + raid56_full_stripe_start *= full_stripe_len; + + /* + * For writes to RAID[56], allow a full stripeset across + * all disks. For other RAID types and for RAID[56] + * reads, just allow a single stripe (on a single disk). 
+ */ + if (op == BTRFS_MAP_WRITE) { + max_len = stripe_len * data_stripes - + (offset - raid56_full_stripe_start); + } + } + len = min_t(u64, em->len - offset, max_len); + } else { + len = em->len - offset; + } + + io_geom->len = len; + io_geom->offset = offset; + io_geom->stripe_len = stripe_len; + io_geom->stripe_nr = stripe_nr; + io_geom->stripe_offset = stripe_offset; + io_geom->raid56_stripe_offset = raid56_full_stripe_start; + + return 0; +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5939,6 +6025,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 stripe_nr; u64 stripe_len; u32 stripe_index; + int data_stripes; int i; int ret = 0; int num_stripes; @@ -5951,76 +6038,29 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int patch_the_first_stripe_for_dev_replace = 0; u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; + struct btrfs_io_geometry geom; + + ASSERT(bbio_ret); if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, *length, bbio_ret); - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom); + if (ret < 0) + return ret; + em = btrfs_get_chunk_map(fs_info, logical, *length); + ASSERT(em); map = em->map_lookup; - offset = logical - em->start; - - stripe_len = map->stripe_len; - stripe_nr = offset; - /* - * stripe_nr counts the total number of stripes we have to stride - * to get to this block - */ - stripe_nr = div64_u64(stripe_nr, stripe_len); - - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, - "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu", - stripe_offset, offset, em->start, logical, - stripe_len); - free_extent_map(em); - return -EINVAL; - } - - /* stripe_offset is the offset of this block in its stripe*/ - stripe_offset = offset - stripe_offset; - - /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); - raid56_full_stripe_start = offset; - /* allow a write of a full stripe, but make sure we don't - * allow straddling of stripes - */ - raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, - full_stripe_len); - raid56_full_stripe_start *= full_stripe_len; - } - - if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - u64 max_len; - /* For writes to RAID[56], allow a full stripeset across all disks. - For other RAID types and for RAID[56] reads, just allow a single - stripe (on a single disk). 
-        if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
-            (op == BTRFS_MAP_WRITE)) {
-            max_len = stripe_len * nr_data_stripes(map) -
-                (offset - raid56_full_stripe_start);
-        } else {
-            /* we limit the length of each bio to what fits in a stripe */
-            max_len = stripe_len - stripe_offset;
-        }
-        *length = min_t(u64, em->len - offset, max_len);
-    } else {
-        *length = em->len - offset;
-    }
-
-    /*
-     * This is for when we're called from btrfs_bio_fits_in_stripe and all
-     * it cares about is the length
-     */
-    if (!bbio_ret)
-        goto out;
+    *length = geom.len;
+    offset = geom.offset;
+    stripe_len = geom.stripe_len;
+    stripe_nr = geom.stripe_nr;
+    stripe_offset = geom.stripe_offset;
+    raid56_full_stripe_start = geom.raid56_stripe_offset;
+    data_stripes = nr_data_stripes(map);

    down_read(&dev_replace->rwsem);
    dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6052,7 +6092,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                    &stripe_index);
        if (!need_full_stripe(op))
            mirror_num = 1;
-    } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+    } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
        if (need_full_stripe(op))
            num_stripes = map->num_stripes;
        else if (mirror_num)
@@ -6094,7 +6134,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
            /* push stripe_nr back to the start of the full stripe */
            stripe_nr = div64_u64(raid56_full_stripe_start,
-                    stripe_len * nr_data_stripes(map));
+                    stripe_len * data_stripes);

            /* RAID[56] write or recovery. Return all stripes */
            num_stripes = map->num_stripes;
@@ -6110,10 +6150,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
             * Mirror #3 is RAID6 Q block.
             */
            stripe_nr = div_u64_rem(stripe_nr,
-                    nr_data_stripes(map), &stripe_index);
+                    data_stripes, &stripe_index);
            if (mirror_num > 1)
-                stripe_index = nr_data_stripes(map) +
-                        mirror_num - 2;
+                stripe_index = data_stripes + mirror_num - 2;

            /* We distribute the parity blocks across stripes */
            div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
@@ -6171,8 +6210,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
        div_u64_rem(stripe_nr, num_stripes, &rot);

        /* Fill in the logical address of each stripe */
-        tmp = stripe_nr * nr_data_stripes(map);
-        for (i = 0; i < nr_data_stripes(map); i++)
+        tmp = stripe_nr * data_stripes;
+        for (i = 0; i < data_stripes; i++)
            bbio->raid_map[(i+rot) % num_stripes] =
                em->start + (tmp + i) * map->stripe_len;
@@ -6687,7 +6726,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
              struct btrfs_chunk *chunk)
{
    struct btrfs_fs_info *fs_info = leaf->fs_info;
-    struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+    struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    struct map_lookup *map;
    struct extent_map *em;
    u64 logical;
@@ -6712,9 +6751,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
        return ret;
    }

-    read_lock(&map_tree->map_tree.lock);
-    em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
-    read_unlock(&map_tree->map_tree.lock);
+    read_lock(&map_tree->lock);
+    em = lookup_extent_mapping(map_tree, logical, 1);
+    read_unlock(&map_tree->lock);

    /* already mapped? */
    if (em && em->start <= logical && em->start + em->len > logical) {
@@ -6783,9 +6822,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
    }

-    write_lock(&map_tree->map_tree.lock);
-    ret = add_extent_mapping(&map_tree->map_tree, em, 0);
-    write_unlock(&map_tree->map_tree.lock);
+    write_lock(&map_tree->lock);
+    ret = add_extent_mapping(map_tree, em, 0);
+    write_unlock(&map_tree->lock);
    if (ret < 0) {
        btrfs_err(fs_info,
              "failed to add chunk map, start=%llu len=%llu: %d",
@@ -7103,14 +7142,14 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
                   struct btrfs_device *failing_dev)
{
-    struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+    struct extent_map_tree *map_tree = &fs_info->mapping_tree;
    struct extent_map *em;
    u64 next_start = 0;
    bool ret = true;

-    read_lock(&map_tree->map_tree.lock);
-    em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
-    read_unlock(&map_tree->map_tree.lock);
+    read_lock(&map_tree->lock);
+    em = lookup_extent_mapping(map_tree, 0, (u64)-1);
+    read_unlock(&map_tree->lock);
    /* No chunk at all? Return false anyway */
    if (!em) {
        ret = false;
@@ -7148,10 +7187,10 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
        next_start = extent_map_end(em);
        free_extent_map(em);

-        read_lock(&map_tree->map_tree.lock);
-        em = lookup_extent_mapping(&map_tree->map_tree, next_start,
+        read_lock(&map_tree->lock);
+        em = lookup_extent_mapping(map_tree, next_start,
                       (u64)(-1) - next_start);
-        read_unlock(&map_tree->map_tree.lock);
+        read_unlock(&map_tree->lock);
    }
out:
    return ret;
@@ -7600,10 +7639,9 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
 */
int btrfs_bg_type_to_factor(u64 flags)
{
-    if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-             BTRFS_BLOCK_GROUP_RAID10))
-        return 2;
-    return 1;
+    const int index = btrfs_bg_flags_to_raid_index(flags);
+
+    return btrfs_raid_array[index].ncopies;
}

@@ -7612,7 +7650,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
                 u64 chunk_offset, u64 devid,
                 u64 physical_offset, u64 physical_len)
{
-    struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+    struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    struct extent_map *em;
    struct map_lookup *map;
    struct btrfs_device *dev;
@@ -7701,7 +7739,7 @@ out:

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
-    struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+    struct extent_map_tree *em_tree = &fs_info->mapping_tree;
    struct extent_map *em;
    struct rb_node *node;
    int ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 136a3eb64604..7f6aa1816409 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -23,6 +23,21 @@ struct btrfs_pending_bios {
    struct bio *tail;
};

+struct btrfs_io_geometry {
+    /* remaining bytes before crossing a stripe */
+    u64 len;
+    /* offset of logical address in chunk */
+    u64 offset;
+    /* length of single IO stripe */
+    u64 stripe_len;
+    /* number of stripe where address falls */
+    u64 stripe_nr;
+    /* offset of address in stripe */
+    u64 stripe_offset;
+    /* offset of raid56 stripe into the chunk */
+    u64 raid56_stripe_offset;
+};
+
/*
 * Use sequence counter to get consistent device stat data on
 * 32-bit processors.
@@ -43,8 +58,8 @@ struct btrfs_pending_bios {
#define BTRFS_DEV_STATE_FLUSH_SENT	(4)

struct btrfs_device {
-    struct list_head dev_list;
-    struct list_head dev_alloc_list;
+    struct list_head dev_list; /* device_list_mutex */
+    struct list_head dev_alloc_list; /* chunk mutex */
    struct list_head post_commit_list; /* chunk mutex */
    struct btrfs_fs_devices *fs_devices;
    struct btrfs_fs_info *fs_info;
@@ -229,9 +244,14 @@ struct btrfs_fs_devices {
     * this mutex lock.
     */
    struct mutex device_list_mutex;
+
+    /* List of all devices, protected by device_list_mutex */
    struct list_head devices;

-    /* devices not currently being allocated */
+    /*
+     * Devices which can satisfy space allocation. Protected by
+     * chunk_mutex
+     */
    struct list_head alloc_list;

    struct btrfs_fs_devices *seed;
@@ -336,16 +356,16 @@ struct btrfs_device_info {
};

struct btrfs_raid_attr {
-    int sub_stripes;        /* sub_stripes info for map */
-    int dev_stripes;        /* stripes per dev */
-    int devs_max;           /* max devs to use */
-    int devs_min;           /* min devs needed */
-    int tolerated_failures; /* max tolerated fail devs */
-    int devs_increment;     /* ndevs has to be a multiple of this */
-    int ncopies;            /* how many copies to data has */
-    int nparity;            /* number of stripes worth of bytes to store
+    u8 sub_stripes;         /* sub_stripes info for map */
+    u8 dev_stripes;         /* stripes per dev */
+    u8 devs_max;            /* max devs to use */
+    u8 devs_min;            /* min devs needed */
+    u8 tolerated_failures;  /* max tolerated fail devs */
+    u8 devs_increment;      /* ndevs has to be a multiple of this */
+    u8 ncopies;             /* how many copies to data has */
+    u8 nparity;             /* number of stripes worth of bytes to store
                             * parity information */
-    int mindev_error;       /* error code if min devs requisite is unmet */
+    u8 mindev_error;        /* error code if min devs requisite is unmet */
    const char raid_name[8]; /* name of the raid */
    u64 bg_flag;            /* block group flag of the raid */
};
@@ -408,13 +428,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
             u64 logical, u64 *length,
             struct btrfs_bio **bbio_ret);
+int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
+            u64 logical, u64 len, struct btrfs_io_geometry *io_geom);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
             u64 physical, u64 **logical, int *naddrs, int *stripe_len);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
-void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
-void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct extent_map_tree *tree);
blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
               int mirror_num, int async_submit);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
@@ -557,8 +578,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
    return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

-const char *get_raid_name(enum btrfs_raid_types type);
-
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);

struct list_head *btrfs_get_fs_uuids(void);
@@ -568,6 +587,7 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
                   struct btrfs_device *failing_dev);
int btrfs_bg_type_to_factor(u64 flags);
+const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);

#endif
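Editor's note: the diff above introduces struct btrfs_io_geometry and btrfs_get_io_geometry(), which pull the stripe arithmetic out of __btrfs_map_block(). The short userspace sketch below is illustrative only; it is not kernel code, the struct, the helper name get_io_geometry_sketch and every constant in main() are made up for this note, and it assumes the simple non-RAID56 case (no extent map lookup, no full-stripe handling). It shows how a logical address is turned into a stripe number, an offset inside that stripe, and the number of bytes an IO may cover before it would straddle a stripe boundary.

/*
 * Simplified, standalone sketch of the stripe geometry math for a striped
 * (non-RAID56) profile. Assumptions: the chunk is described only by its
 * start and length, and a bio must not cross a stripe boundary.
 */
#include <stdint.h>
#include <stdio.h>

struct io_geometry_sketch {
    uint64_t len;           /* bytes left before the IO crosses a stripe */
    uint64_t offset;        /* offset of the logical address in the chunk */
    uint64_t stripe_len;    /* length of a single IO stripe */
    uint64_t stripe_nr;     /* which stripe the address falls in */
    uint64_t stripe_offset; /* offset of the address inside that stripe */
};

static int get_io_geometry_sketch(uint64_t chunk_start, uint64_t chunk_len,
                                  uint64_t stripe_len, uint64_t logical,
                                  uint64_t io_len,
                                  struct io_geometry_sketch *geom)
{
    uint64_t offset, max_len;

    if (logical < chunk_start || logical >= chunk_start + chunk_len)
        return -1; /* address does not fall inside this chunk */

    offset = logical - chunk_start;
    geom->offset = offset;
    geom->stripe_len = stripe_len;
    geom->stripe_nr = offset / stripe_len;
    geom->stripe_offset = offset - geom->stripe_nr * stripe_len;

    /* clamp the IO to what is left of the current stripe */
    max_len = stripe_len - geom->stripe_offset;
    geom->len = io_len < max_len ? io_len : max_len;
    return 0;
}

int main(void)
{
    struct io_geometry_sketch geom;

    /* 1 GiB chunk starting at 16 MiB, 64 KiB stripes (made-up numbers) */
    if (get_io_geometry_sketch(16ULL << 20, 1ULL << 30, 64 << 10,
                               (16ULL << 20) + 200000, 128 << 10, &geom))
        return 1;

    printf("stripe_nr=%llu stripe_offset=%llu len=%llu\n",
           (unsigned long long)geom.stripe_nr,
           (unsigned long long)geom.stripe_offset,
           (unsigned long long)geom.len);
    return 0;
}

With the made-up numbers in main() this prints stripe_nr=3 stripe_offset=3392 len=62144, i.e. a 128 KiB request starting 3392 bytes into stripe 3 is clamped to the 62144 bytes remaining in that stripe, which is the kind of clamping the kernel helper reports back to btrfs_bio_fits_in_stripe() and __btrfs_map_block().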