From 34d9700702f4042ce10d68a092ab7f79575e7a3b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 16 Mar 2016 16:43:06 +0800 Subject: btrfs: rename btrfs_std_error to btrfs_handle_fs_error btrfs_std_error() handles errors, puts FS into readonly mode (as of now). So its good idea to rename it to btrfs_handle_fs_error(). Signed-off-by: Anand Jain Reviewed-by: David Sterba [ edit changelog ] Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 +++--- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode-item.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/root-tree.c | 4 ++-- fs/btrfs/super.c | 6 +++--- fs/btrfs/transaction.c | 2 +- fs/btrfs/tree-log.c | 8 ++++---- fs/btrfs/volumes.c | 14 +++++++------- 12 files changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ec7928a27aaa..decd0a3f5d61 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1928,7 +1928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -2031,7 +2031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..12f80cd230ab 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4329,7 +4329,7 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char *btrfs_decode_error(int errno); @@ -4472,9 +4472,9 @@ do { \ __LINE__, (errno)); \ } while (0) -#define btrfs_std_error(fs_info, errno, fmt, args...) \ +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ do { \ - __btrfs_std_error((fs_info), __func__, __LINE__, \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ } while (0) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e47849d7427..aeb090583b78 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2417,7 +2417,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_std_error(tree_root->fs_info, ret, + btrfs_handle_fs_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -3646,7 +3646,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3686,7 +3686,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3704,7 +3704,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..c0520a94a333 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9058,7 +9058,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index be4d22a5022f..b8acc07ac6c2 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT, NULL); + btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..06fcc448108b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4847,7 +4847,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 08ef890deca6..1c29514d8aff 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2418,7 +2418,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9fcd6dfc3266..b2b14e7115f1 100644 --- a/fs/btrfs/root-tree.c +++ 
b/fs/btrfs/root-tree.c @@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..cc077887bd9d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -131,11 +131,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } /* - * __btrfs_std_error decodes expected errors from the caller and + * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. */ __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -252,7 +252,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, /* Wake up anybody who may be waiting on this transaction */ wake_up(&root->fs_info->transaction_wait); wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_std_error(root->fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..a469b55a7ac1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2145,7 +2145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 517d0ccb351e..16a74d1a2720 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5519,7 +5519,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_std_error(fs_info, ret, "Failed to pin buffers while " + btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5533,7 +5533,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5551,7 +5551,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5566,7 +5566,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_std_error(fs_info, ret, "Couldn't read target root " + btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..0d3dd3e10e31 100644 --- 
a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1454,7 +1454,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_std_error(root->fs_info, ret, "Slot search failed"); + btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1462,7 +1462,7 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); @@ -2418,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2663,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_std_error(root->fs_info, -ENOENT, + btrfs_handle_fs_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2671,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2857,7 +2857,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } @@ -3632,7 +3632,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } -- cgit v1.2.3-55-g7522 From 2351d743f6b69d289f04246127c9d92516621974 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 16 Mar 2016 16:43:07 +0800 Subject: btrfs: remove unused function btrfs_assert() Apparently looks like ASSERT does the same intended job, as intended btrfs_assert(). Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 12f80cd230ab..4f37e3766626 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4326,7 +4326,6 @@ static inline void assfail(char *expr, char *file, int line) #define ASSERT(expr) ((void)0) #endif -#define btrfs_assert() __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, -- cgit v1.2.3-55-g7522 From c5f4ccb2f77355ac44433feb362fecd7f1a7d03e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 16 Mar 2016 16:43:08 +0800 Subject: btrfs: move error handling code together in ctree.h Looks like we added the incompatible defines in between the error handling defines in the file ctree.h. Now group them back. 
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 78 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4f37e3766626..e22568a54545 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4338,6 +4338,46 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + + +/* compatibility and incompatibility defines */ + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) @@ -4454,44 +4494,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. - */ -#define btrfs_abort_transaction(trans, root, errno) \ -do { \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ - } \ - __btrfs_abort_transaction((trans), (root), __func__, \ - __LINE__, (errno)); \ -} while (0) - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) 
\ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); -- cgit v1.2.3-55-g7522 From 13f48dc9094b56c5bffd8d57349a0a01a1926b2d Mon Sep 17 00:00:00 2001 From: Satoru Takeuchi Date: Tue, 15 Mar 2016 09:09:59 +0900 Subject: btrfs: Simplify conditions about compress while mapping btrfs flags to inode flags Signed-off-by: Satoru Takeuchi Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 06fcc448108b..0f76aea6e398 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; - if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) - iflags |= FS_COMPR_FL; - else if (flags & BTRFS_INODE_NOCOMPRESS) + if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; + else if (flags & BTRFS_INODE_COMPRESS) + iflags |= FS_COMPR_FL; return iflags; } -- cgit v1.2.3-55-g7522 From 0713d90c75745dc6148f6346d490e9ef63a4e8b4 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 17 Mar 2016 10:38:57 +0800 Subject: btrfs: remove save_error_info() Actually save_error_info() sets the FS state to error and nothing else. Further the word save doesn't induce caffeine when compared to the word set in what actually it does. So to make it better understandable move save_error_info() code to its only consumer itself. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cc077887bd9d..dab51118b972 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -170,8 +161,13 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function } #endif + /* + * Today we only save the error info to memory. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + /* Don't go through full error handling during mount */ - save_error_info(fs_info); if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); } -- cgit v1.2.3-55-g7522 From 6719afdcf808b38145095b2e485c0f90bfc29722 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 22 Jun 2014 14:30:09 +0200 Subject: Btrfs: Refactor btrfs_lock_cluster() to kill compiler warning fs/btrfs/extent-tree.c: In function ‘btrfs_lock_cluster’: fs/btrfs/extent-tree.c:6399: warning: ‘used_bg’ may be used uninitialized in this function - Replace "again: ... goto again;" by standard C "while (1) { ... 
}", - Move block not processed during the first iteration of the loop to the end of the loop, which allows to kill the "locked" variable, Signed-off-by: Geert Uytterhoeven Reviewed-and-Tested-by: Miao Xie [ the compilation warning has been fixed by other patch, now we want to clean up the function ] Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c0520a94a333..b66f460f779a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7025,36 +7025,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void -- cgit v1.2.3-55-g7522 From 180e4d4700b1b7bfdffd9e58ae95220ae9482d17 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Mon, 4 Apr 2016 15:31:22 +0100 Subject: btrfs: fix typos in comments Correct a typo in the chunk_mutex name to make it grepable. Since it is better to fix several typos at once, fixing the 2 more in the same file. Signed-off-by: Luis de Bethencourt Signed-off-by: David Sterba --- fs/btrfs/super.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dab51118b972..957976d9d627 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1484,10 +1484,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); } else { /* - * Since SELinux(the only one supports security_mnt_opts) does - * NOT support changing context during remount/mount same sb, - * This must be the same or part of the same security options, - * just free it. + * Since SELinux (the only one supporting security_mnt_opts) + * does NOT support changing context during remount/mount of + * the same sb, this must be the same or part of the same + * security options, just free it. */ security_free_mnt_opts(sec_opts); } @@ -1665,8 +1665,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { /* - * We need cleanup all defragable inodes if the autodefragment is - * close or the fs is R/O. + * We need to cleanup all defragable inodes if the autodefragment is + * close or the filesystem is read only. 
*/ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -2049,7 +2049,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; /* - * holding chunk_muext to avoid allocating new chunks, holding + * holding chunk_mutex to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ rcu_read_lock(); -- cgit v1.2.3-55-g7522 From 4c63c2454eff996c5e27991221106eb511f7db38 Mon Sep 17 00:00:00 2001 From: Luke Dashjr Date: Thu, 29 Oct 2015 08:22:21 +0000 Subject: btrfs: bugfix: handle FS_IOC32_{GETFLAGS,SETFLAGS,GETVERSION} in btrfs_ioctl 32-bit ioctl uses these rather than the regular FS_IOC_* versions. They can be handled in btrfs using the same code. Without this, 32-bit {ch,ls}attr fail. Signed-off-by: Luke Dashjr Cc: stable@vger.kernel.org Reviewed-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 21 +++++++++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..208d19938fdf 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -4122,6 +4122,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8d7b5a45c005..751daacd268d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2956,7 +2956,7 @@ const struct file_operations btrfs_file_operations = { .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2aaba58b4856..167fc3d49450 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10184,7 +10184,7 @@ static const struct file_operations btrfs_dir_file_operations = { .iterate = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..f545f81f642d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5552,3 +5552,24 @@ long btrfs_ioctl(struct file *file, unsigned int return -ENOTTY; } + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif -- cgit v1.2.3-55-g7522 From a91326679f2a0a4c239cd643674fdcda28ee86be Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Mar 2016 16:56:21 -0800 Subject: Btrfs: make mapping->writeback_index point to the last written page If sequential writer is writing in the middle of the page and it just redirties the last written page by continuing from 
it. In the above case this can end up with seeking back to that firstly redirtied page after writing all the pages at the end of file because btrfs updates mapping->writeback_index to 1 past the current one. For non-cow filesystems, the cost is only about extra seek, while for cow filesystems such as btrfs, it means unnecessary fragments. To avoid it, we just need to continue writeback from the last written page. This also updates btrfs to behave like what write_cache_pages() does, ie, bail out immediately if there is an error in writepage(). Reported-by: Holger Hoffstätte Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d247fc0eea19..fd41a730d239 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3200,14 +3200,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) +static void update_nr_written(struct page *page, struct writeback_control *wbc, + unsigned long nr_written) { wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; } /* @@ -3926,6 +3922,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; int scanned = 0; int tag; @@ -3948,6 +3946,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3957,6 +3957,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); + done_index = index; while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -3966,6 +3967,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -4009,6 +4011,20 @@ retry: } if (!err && ret < 0) err = ret; + if (ret < 0) { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } /* * the filesystem may choose to bump up nr_to_write. @@ -4029,6 +4045,10 @@ retry: index = 0; goto retry; } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + btrfs_add_delayed_iput(inode); return err; } -- cgit v1.2.3-55-g7522 From 894b36e35ae01186b77b083f3f67569a349062a6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Mar 2016 16:56:22 -0800 Subject: Btrfs: cleanup error handling in extent_write_cached_pages Now that we bail out immediately if ->writepage() returns an error, we don't need an extra error to retain the error code. 
Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fd41a730d239..b67d6d24440b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3916,7 +3916,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; - int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; @@ -4009,8 +4008,6 @@ retry: unlock_page(page); ret = 0; } - if (!err && ret < 0) - err = ret; if (ret < 0) { /* * done_index is set past this page, @@ -4036,7 +4033,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done && !err) { + if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -4050,7 +4047,7 @@ retry: mapping->writeback_index = done_index; btrfs_add_delayed_iput(inode); - return err; + return ret; } static void flush_epd_write_bio(struct extent_page_data *epd) -- cgit v1.2.3-55-g7522 From a2af23b7d7cb0de89570e97da84f6fb642e990a4 Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Mon, 4 Apr 2016 02:53:06 +0530 Subject: Btrfs: __btrfs_buffered_write: Pass valid file offset when releasing delalloc space The delalloc reserved space is calculated in terms of number of bytes used by an integral number of blocks. This is done by rounding down the value of 'pos' to the nearest multiple of sectorsize. The file offset value held by 'pos' variable may not be aligned to sectorsize and hence when passing it as an argument to btrfs_delalloc_release_space(), we may end up releasing larger delalloc space than we originally had reserved. Signed-off-by: Chandan Rajendra Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 751daacd268d..af059c44684d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1696,7 +1696,9 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, pos, release_bytes); + btrfs_delalloc_release_space(inode, + round_down(pos, root->sectorsize), + release_bytes); } } -- cgit v1.2.3-55-g7522 From cf25ce518e8ef9d59b292e51193bed2b023a32da Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Dec 2015 18:29:32 -0800 Subject: Btrfs: do not create empty block group if we have allocated data Now we force to create empty block group to keep data profile alive, however, in the below example, we eventually get an empty block group while we're trying to get more space for other types (metadata/system), - Before, block group "A": size=2G, used=1.2G block group "B": size=2G, used=512M - After "btrfs balance start -dusage=50 mount_point", block group "A": size=2G, used=(1.2+0.5)G block group "C": size=2G, used=0 Since there is no data in block group C, it won't be deleted automatically and we have to get the unused 2G until the next mount. Balance itself just moves data and doesn't remove data, so it's safe to not create such a empty block group if we already have data allocated in other block groups. 
Signed-off-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..745a619241c7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3402,6 +3402,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3540,7 +3541,13 @@ again: goto loop; } - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + ASSERT(fs_info->data_sinfo); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + !chunk_reserved && !bytes_used) { trans = btrfs_start_transaction(chunk_root, 0); if (IS_ERR(trans)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); -- cgit v1.2.3-55-g7522 From 6cf86a006be9f2a77434d5a06ea289719815ad7c Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:29 +0800 Subject: btrfs: create a helper function to read the disk super A part of code from btrfs_scan_one_device() is moved to a new function btrfs_read_disk_super(), so that former function looks cleaner. (In this process it also moves the code which ensures null terminating label). So this creates easy opportunity to merge various duplicate codes on read disk super. Earlier attempt to merge duplicate codes highlighted that there were some issues for which there are duplicate codes (to read disk super), however it was not clear what was the issue. So until we figure that out, its better to keep them in a separate functions. Signed-off-by: Anand Jain [ use GFP_KERNEL, PAGE_CACHE_ removal related fixups ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 87 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..5006683283ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -988,6 +988,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +void btrfs_release_disk_super(struct page *page) +{ + kunmap(page); + put_page(page); +} + +int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, struct btrfs_super_block **disk_super) +{ + void *p; + pgoff_t index; + + /* make sure our super fits in the device */ + if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + return 1; + + /* make sure our super fits in the page */ + if (sizeof(**disk_super) > PAGE_SIZE) + return 1; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_SHIFT; + if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) + return 1; + + /* pull in the page with our super */ + *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_KERNEL); + + if (IS_ERR_OR_NULL(*page)) + return 1; + + p = kmap(*page); + + /* align our pointer to the offset of the super block */ + *disk_super = p + (bytenr & ~PAGE_MASK); + + if (btrfs_super_bytenr(*disk_super) != bytenr || + btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(*page); + return 1; + } + + if ((*disk_super)->label[0] && + (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) + (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + + return 0; +} + /* * Look for a btrfs signature on a device. 
This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -999,13 +1049,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_super_block *disk_super; struct block_device *bdev; struct page *page; - void *p; int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; u64 bytenr; - pgoff_t index; /* * we would like to check all the supers, but that would make @@ -1018,41 +1066,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto error; } - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) - goto error_bdev_put; - - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) - goto error_bdev_put; - - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - goto error_bdev_put; - - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_NOFS); - - if (IS_ERR_OR_NULL(page)) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) goto error_bdev_put; - p = kmap(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + (bytenr & ~PAGE_MASK); - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) - goto error_unmap; - devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); @@ -1060,8 +1081,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (ret > 0) { if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); @@ -1073,9 +1092,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; -error_unmap: - kunmap(page); - put_page(page); + btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); -- cgit v1.2.3-55-g7522 From f1fa7f264250f2cb60119aca3fd114c3f47070c2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:33 +0800 Subject: btrfs: create helper function __check_raid_min_devices() move a section of btrfs_rm_device() code to check for min number of the devices into the function __check_raid_min_devices() Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5006683283ae..db781d0d3678 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1705,23 +1705,20 @@ out: return ret; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path) +static int __check_raid_min_devices(struct btrfs_root *root) { - struct btrfs_device *device; - struct btrfs_device *next_device; - struct block_device *bdev; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; - struct btrfs_fs_devices *cur_devices; u64 all_avail; - u64 devid; u64 
num_devices; - u8 *dev_uuid; unsigned seq; int ret = 0; - bool clear_super = false; - mutex_lock(&uuid_mutex); + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); do { seq = read_seqbegin(&root->fs_info->profiles_lock); @@ -1731,14 +1728,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_metadata_alloc_bits; } while (read_seqretry(&root->fs_info->profiles_lock, seq)); - num_devices = root->fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { - WARN_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; goto out; @@ -1760,6 +1749,30 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } +out: + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + struct btrfs_fs_devices *cur_devices; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + bool clear_super = false; + + mutex_lock(&uuid_mutex); + + ret = __check_raid_min_devices(root); + if (ret) + goto out; + if (strcmp(device_path, "missing") == 0) { struct list_head *devices; struct btrfs_device *tmp; -- cgit v1.2.3-55-g7522 From bd45ffbcb1f082e3c5a0bd56b1d7310d8b707ffb Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:34 +0800 Subject: btrfs: clean up and optimize __check_raid_min_device() __check_raid_min_device() which was pealed from btrfs_rm_device() maintianed its original code to show the block move. This patch cleans up __check_raid_min_device(). 
Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index db781d0d3678..5538dc78c72e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1705,52 +1705,47 @@ out: return ret; } -static int __check_raid_min_devices(struct btrfs_root *root) +static int __check_raid_min_devices(struct btrfs_fs_info *fs_info) { u64 all_avail; u64 num_devices; unsigned seq; - int ret = 0; - num_devices = root->fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); do { - seq = read_seqbegin(&root->fs_info->profiles_lock); + seq = read_seqbegin(&fs_info->profiles_lock); - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&root->fs_info->profiles_lock, seq)); + all_avail = fs_info->avail_data_alloc_bits | + fs_info->avail_system_alloc_bits | + fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; - goto out; + return BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; - goto out; + return BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - root->fs_info->fs_devices->rw_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; - goto out; + fs_info->fs_devices->rw_devices <= 2) { + return BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; } + if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - root->fs_info->fs_devices->rw_devices <= 3) { - ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; - goto out; + fs_info->fs_devices->rw_devices <= 3) { + return BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; } -out: - return ret; + return 0; } int btrfs_rm_device(struct btrfs_root *root, char *device_path) @@ -1769,7 +1764,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) mutex_lock(&uuid_mutex); - ret = __check_raid_min_devices(root); + ret = __check_raid_min_devices(root->fs_info); if (ret) goto out; -- cgit v1.2.3-55-g7522 From 24e0474b59538cdb9d2b7318ec7c7ae9f6faf85d Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:35 +0800 Subject: btrfs: create helper btrfs_find_device_by_user_input() The patch renames btrfs_dev_replace_find_srcdev() to btrfs_find_device_by_user_input() and moves it to volumes.c, so that delete device can use it. 
Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 24 +----------------------- fs/btrfs/volumes.c | 19 +++++++++++++++++++ fs/btrfs/volumes.h | 3 +++ 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 26bcb487f958..7ad8ae994ca0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device); static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -329,7 +326,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, + ret = btrfs_find_device_by_user_input(root, args->start.srcdevid, args->start.srcdev_name, &src_device); if (ret) { @@ -626,25 +623,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( write_unlock(&em_tree->lock); } -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device) -{ - int ret; - - if (srcdevid) { - ret = 0; - *device = btrfs_find_device(root->fs_info, srcdevid, NULL, - NULL); - if (!*device) - ret = -ENOENT; - } else { - ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, - device); - } - return ret; -} - void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5538dc78c72e..ab28d9dc1b0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2126,6 +2126,25 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } +int btrfs_find_device_by_user_input(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device) +{ + int ret; + + if (srcdevid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + device); + } + return ret; +} + /* * does all the dirty work required for changing file system's UUID. 
*/ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..508739314e43 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -448,6 +448,9 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +int btrfs_find_device_by_user_input(struct btrfs_root *root, u64 srcdevid, + char *srcdev_name, + struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -- cgit v1.2.3-55-g7522 From 24fc572fe456c02ff4136c07861a3edd4b8de683 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:36 +0800 Subject: btrfs: make use of btrfs_find_device_by_user_input() btrfs_rm_device() has a section of the code which can be replaced btrfs_find_device_by_user_input() Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 100 ++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab28d9dc1b0d..40bbe0a2715b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1752,13 +1752,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) { struct btrfs_device *device; struct btrfs_device *next_device; - struct block_device *bdev; + struct block_device *bdev = NULL; struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; + struct btrfs_super_block *disk_super = NULL; struct btrfs_fs_devices *cur_devices; - u64 devid; u64 num_devices; - u8 *dev_uuid; int ret = 0; bool clear_super = false; @@ -1768,57 +1766,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto out; - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - device = NULL; - devices = &root->fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held. - */ - list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && - !tmp->is_tgtdev_for_dev_replace && - !tmp->bdev) { - device = tmp; - break; - } - } - bdev = NULL; - bh = NULL; - disk_super = NULL; - if (!device) { - ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - goto out; - } - } else { - ret = btrfs_get_bdev_and_sb(device_path, - FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder, 0, - &bdev, &bh); - if (ret) - goto out; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root->fs_info, devid, dev_uuid, - disk_super->fsid); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } - } + ret = btrfs_find_device_by_user_input(root, 0, device_path, + &device); + if (ret) + goto out; if (device->is_tgtdev_for_dev_replace) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto error_brelse; + goto out; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto error_brelse; + goto out; } if (device->writeable) { @@ -1908,16 +1868,33 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. 
We want to * remove it from the devices list and zero out the old super */ - if (clear_super && disk_super) { + if (clear_super) { u64 bytenr; int i; + if (!disk_super) { + ret = btrfs_get_bdev_and_sb(device_path, + FMODE_WRITE | FMODE_EXCL, + root->fs_info->bdev_holder, 0, + &bdev, &bh); + if (ret) { + /* + * It could be a failed device ok for clear_super + * to fail. So return success + */ + ret = 0; + goto out; + } + + disk_super = (struct btrfs_super_block *)bh->b_data; + } /* make sure this device isn't detected as part of * the FS anymore */ memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + brelse(bh); /* clear the mirror copies of super block on the disk * being removed, 0th copy is been taken care above and @@ -1929,7 +1906,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) i_size_read(bdev->bd_inode)) break; - brelse(bh); bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) @@ -1939,32 +1915,30 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (btrfs_super_bytenr(disk_super) != bytenr || btrfs_super_magic(disk_super) != BTRFS_MAGIC) { + brelse(bh); continue; } memset(&disk_super->magic, 0, sizeof(disk_super->magic)); set_buffer_dirty(bh); sync_dirty_buffer(bh); + brelse(bh); } - } - ret = 0; - - if (bdev) { - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + if (bdev) { + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); + } } -error_brelse: - brelse(bh); - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: mutex_unlock(&uuid_mutex); return ret; + error_undo: if (device->writeable) { lock_chunks(root); @@ -1973,7 +1947,7 @@ error_undo: device->fs_devices->rw_devices++; unlock_chunks(root); } - goto error_brelse; + goto out; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, -- cgit v1.2.3-55-g7522 From b3d1b1532ff9620ff5dba891a96f3e912005eb10 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:37 +0800 Subject: btrfs: enhance btrfs_find_device_by_user_input() to check device path The operation of device replace and device delete follows same steps upto some depth with in btrfs kernel, however they don't share codes. This enhancement will help replace and delete to share codes. 
Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 ---- fs/btrfs/volumes.c | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 7ad8ae994ca0..fdd2880707df 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -320,10 +320,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, return -EINVAL; } - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_find_device_by_user_input(root, args->start.srcdevid, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 40bbe0a2715b..d74260567bea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2113,6 +2113,9 @@ int btrfs_find_device_by_user_input(struct btrfs_root *root, u64 srcdevid, if (!*device) ret = -ENOENT; } else { + if (!srcdev_name || !srcdev_name[0]) + return -EINVAL; + ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, device); } -- cgit v1.2.3-55-g7522 From 42b674271566c26692c32f4ec1f4b4cf14d5046f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:38 +0800 Subject: btrfs: make use of btrfs_scratch_superblocks() in btrfs_rm_device() With the previous patches now the btrfs_scratch_superblocks() is ready to be used in btrfs_rm_device() so use it. Signed-off-by: Anand Jain [ use GFP_KERNEL ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 77 +++++++++--------------------------------------------- 1 file changed, 13 insertions(+), 64 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d74260567bea..8264b06756e6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1752,13 +1752,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) { struct btrfs_device *device; struct btrfs_device *next_device; - struct block_device *bdev = NULL; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super = NULL; struct btrfs_fs_devices *cur_devices; u64 num_devices; int ret = 0; bool clear_super = false; + char *dev_name = NULL; mutex_lock(&uuid_mutex); @@ -1786,6 +1784,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; unlock_chunks(root); + dev_name = kstrdup(device->name->str, GFP_KERNEL); + if (!dev_name) { + ret = -ENOMEM; + goto error_undo; + } clear_super = true; } @@ -1869,73 +1872,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * remove it from the devices list and zero out the old super */ if (clear_super) { - u64 bytenr; - int i; - - if (!disk_super) { - ret = btrfs_get_bdev_and_sb(device_path, - FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder, 0, - &bdev, &bh); - if (ret) { - /* - * It could be a failed device ok for clear_super - * to fail. 
So return success - */ - ret = 0; - goto out; - } - - disk_super = (struct btrfs_super_block *)bh->b_data; - } - /* make sure this device isn't detected as part of - * the FS anymore - */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); - - /* clear the mirror copies of super block on the disk - * being removed, 0th copy is been taken care above and - * the below would take of the rest - */ - for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) - continue; + struct block_device *bdev; - disk_super = (struct btrfs_super_block *)bh->b_data; - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - brelse(bh); - continue; - } - memset(&disk_super->magic, 0, - sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - - if (bdev) { - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (!IS_ERR(bdev)) { + btrfs_scratch_superblocks(bdev, dev_name); blkdev_put(bdev, FMODE_READ | FMODE_EXCL); } } out: + kfree(dev_name); + mutex_unlock(&uuid_mutex); return ret; -- cgit v1.2.3-55-g7522 From 6b526ed70cf189660d009ea6f17af77a9cca0f38 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:39 +0800 Subject: btrfs: introduce device delete by devid This introduces new ioctl BTRFS_IOC_RM_DEV_V2, which uses enhanced struct btrfs_ioctl_vol_args_v2 to carry devid as an user argument. The patch won't delete the old ioctl interface and so kernel remains backward compatible with user land progs. Test case/script: echo "0 $(blockdev --getsz /dev/sdf) linear /dev/sdf 0" | dmsetup create bad_disk mkfs.btrfs -f -d raid1 -m raid1 /dev/sdd /dev/sde /dev/mapper/bad_disk mount /dev/sdd /btrfs dmsetup suspend bad_disk echo "0 $(blockdev --getsz /dev/sdf) error /dev/sdf 0" | dmsetup load bad_disk dmsetup resume bad_disk echo "bad disk failed. now deleting/replacing" btrfs dev del 3 /btrfs echo $? 
btrfs fi show /btrfs umount /btrfs btrfs-show-super /dev/sdd | egrep num_device dmsetup remove bad_disk wipefs -a /dev/sdf Signed-off-by: Anand Jain Reported-by: Martin [ adjust messages, s/disk/device/ ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 58 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/volumes.c | 4 ++-- fs/btrfs/volumes.h | 2 +- include/uapi/linux/btrfs.h | 14 ++++++++++- 4 files changed, 73 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..7cf0f6328d99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2667,6 +2667,60 @@ out: return ret; } +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto err_drop; + } + + /* Check for compatibility reject unknown flags */ + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS) + return -ENOTTY; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out; + } + + mutex_lock(&root->fs_info->volume_mutex); + if (vol_args->flags & BTRFS_DEVICE_BY_ID) { + ret = btrfs_rm_device(root, NULL, vol_args->devid); + } else { + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name, 0); + } + mutex_unlock(&root->fs_info->volume_mutex); + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_BY_ID) + btrfs_info(root->fs_info, "device deleted: id %llu", + vol_args->devid); + else + btrfs_info(root->fs_info, "device deleted: %s", + vol_args->name); + } +out: + kfree(vol_args); +err_drop: + mnt_drop_write_file(file); + return ret; +} + static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; @@ -2695,7 +2749,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } mutex_lock(&root->fs_info->volume_mutex); - ret = btrfs_rm_device(root, vol_args->name); + ret = btrfs_rm_device(root, vol_args->name, 0); mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); @@ -5459,6 +5513,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_RM_DEV_V2: + return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8264b06756e6..1421d711629f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1748,7 +1748,7 @@ static int __check_raid_min_devices(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path) +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) { struct btrfs_device *device; struct btrfs_device *next_device; @@ -1764,7 +1764,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto out; - ret = btrfs_find_device_by_user_input(root, 0, device_path, + ret = btrfs_find_device_by_user_input(root, devid, device_path, &device); if (ret) goto out; diff --git a/fs/btrfs/volumes.h 
b/fs/btrfs/volumes.h index 508739314e43..c73d027e2f8b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -454,7 +454,7 @@ int btrfs_find_device_by_user_input(struct btrfs_root *root, u64 srcdevid, struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index dea893199257..396a4efca775 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -36,6 +36,13 @@ struct btrfs_ioctl_vol_args { #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) +#define BTRFS_DEVICE_BY_ID (1ULL << 3) +#define BTRFS_VOL_ARG_V2_FLAGS \ + (BTRFS_SUBVOL_CREATE_ASYNC | \ + BTRFS_SUBVOL_RDONLY | \ + BTRFS_SUBVOL_QGROUP_INHERIT | \ + BTRFS_DEVICE_BY_ID) + #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 #define BTRFS_UUID_UNPARSED_SIZE 37 @@ -76,7 +83,10 @@ struct btrfs_ioctl_vol_args_v2 { }; __u64 unused[4]; }; - char name[BTRFS_SUBVOL_NAME_MAX + 1]; + union { + char name[BTRFS_SUBVOL_NAME_MAX + 1]; + u64 devid; + }; }; /* @@ -659,5 +669,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code) struct btrfs_ioctl_feature_flags[2]) #define BTRFS_IOC_GET_SUPPORTED_FEATURES _IOR(BTRFS_IOCTL_MAGIC, 57, \ struct btrfs_ioctl_feature_flags[3]) +#define BTRFS_IOC_RM_DEV_V2 _IOW(BTRFS_IOCTL_MAGIC, 58, \ + struct btrfs_ioctl_vol_args_v2) #endif /* _UAPI_LINUX_BTRFS_H */ -- cgit v1.2.3-55-g7522 From 02feae3c5525771878461b90edd2ba38fd3f5359 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 13 Feb 2016 10:01:40 +0800 Subject: btrfs: optimize check for stale device Optimize check for stale device to only be checked when there is device added or changed. If there is no update to the device, there is no need to call btrfs_free_stale_device(). Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1421d711629f..2fdd2d3322aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -699,7 +699,8 @@ static noinline int device_list_add(const char *path, * if there is new btrfs on an already registered device, * then remove the stale device entry. */ - btrfs_free_stale_device(device); + if (ret > 0) + btrfs_free_stale_device(device); *fs_devices_ret = fs_devices; -- cgit v1.2.3-55-g7522 From f47ab2588e424cb4898b75ace1e2323ddd18b990 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Feb 2016 15:28:48 +0100 Subject: btrfs: rename __check_raid_min_devices Underscores are for special functions, use the full prefix for better stacktrace recognition. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2fdd2d3322aa..9051b4472bea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1706,7 +1706,7 @@ out: return ret; } -static int __check_raid_min_devices(struct btrfs_fs_info *fs_info) +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info) { u64 all_avail; u64 num_devices; @@ -1761,7 +1761,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) mutex_lock(&uuid_mutex); - ret = __check_raid_min_devices(root->fs_info); + ret = btrfs_check_raid_min_devices(root->fs_info); if (ret) goto out; -- cgit v1.2.3-55-g7522 From 3cc31a0d5bc77e35229671307eba5ea5c4c4cb9d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Feb 2016 16:00:26 +0100 Subject: btrfs: pass number of devices to btrfs_check_raid_min_devices Before this patch, btrfs_check_raid_min_devices would do an off-by-one check of the constraints and not the miminmum check, as its name suggests. This is not a problem if the only caller is device remove, but would be confusing for others. Add an argument with the exact number and let the caller(s) decide if this needs any adjustments, like when device replace is running. Reviewed-by: Anand Jain Tested-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9051b4472bea..e966f0ce07f8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1706,20 +1706,17 @@ out: return ret; } -static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info) +/* + * Verify that @num_devices satisfies the RAID profile constraints in the whole + * filesystem. It's up to the caller to adjust that number regarding eg. device + * replace. 
+ */ +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, + u64 num_devices) { u64 all_avail; - u64 num_devices; unsigned seq; - num_devices = fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - WARN_ON(num_devices < 1); - num_devices--; - } - btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - do { seq = read_seqbegin(&fs_info->profiles_lock); @@ -1728,21 +1725,21 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info) fs_info->avail_metadata_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices < 4) { return BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices < 2) { return BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - fs_info->fs_devices->rw_devices <= 2) { + fs_info->fs_devices->rw_devices < 2) { return BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; } if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - fs_info->fs_devices->rw_devices <= 3) { + fs_info->fs_devices->rw_devices < 3) { return BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; } @@ -1761,7 +1758,15 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) mutex_lock(&uuid_mutex); - ret = btrfs_check_raid_min_devices(root->fs_info); + num_devices = root->fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); + if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + WARN_ON(num_devices < 1); + num_devices--; + } + btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); + + ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1); if (ret) goto out; -- cgit v1.2.3-55-g7522 From 621292bae6ae3b25cd3124b63603d01df4cccfb6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Feb 2016 16:28:03 +0100 Subject: btrfs: introduce raid-type to error-code table, for minimum device constraint Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 +++++++++++++++ fs/btrfs/volumes.h | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e966f0ce07f8..6b38b72d678b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, }; +/* + * Table to convert BTRFS_RAID_* to the error code if minimum number of devices + * condition is not met. Zero means there's no corresponding + * BTRFS_ERROR_DEV_*_NOT_MET value. 
+ */ +const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + [BTRFS_RAID_DUP] = 0, + [BTRFS_RAID_RAID0] = 0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index c73d027e2f8b..a13a538cb01e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -340,7 +340,7 @@ struct btrfs_raid_attr { }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; - +extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { -- cgit v1.2.3-55-g7522 From 418775a22b4c67bd15915e043c3a8f29816799bd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Feb 2016 16:28:14 +0100 Subject: btrfs: use existing device constraints table btrfs_raid_array We should avoid duplicating the device constraints, let's use the btrfs_raid_array in btrfs_check_raid_min_devices. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b38b72d678b..055fbebea199 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1731,6 +1731,7 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, { u64 all_avail; unsigned seq; + int i; do { seq = read_seqbegin(&fs_info->profiles_lock); @@ -1740,22 +1741,16 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, fs_info->avail_metadata_alloc_bits; } while (read_seqretry(&fs_info->profiles_lock, seq)); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices < 4) { - return BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; - } - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices < 2) { - return BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; - } + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!(all_avail & btrfs_raid_group[i])) + continue; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - fs_info->fs_devices->rw_devices < 2) { - return BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; - } + if (num_devices < btrfs_raid_array[i].devs_min) { + int ret = btrfs_raid_mindev_error[i]; - if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - fs_info->fs_devices->rw_devices < 3) { - return BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; + if (ret) + return ret; + } } return 0; -- cgit v1.2.3-55-g7522 From 5c5c0df05deaebcdcc9bb31bdca3812a7c22230f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Feb 2016 16:39:55 +0100 Subject: btrfs: rename btrfs_find_device_by_user_input For clarity how we are going to find the device, let's call it a device specifier, devspec for short. Also rename the arguments that are a leftover from previous function purpose. 
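To spell out the devspec convention, a minimal sketch of a caller (illustrative only, not part of this patch; the devid and path values are made up):

	struct btrfs_device *device = NULL;
	int ret;

	/* Non-zero devid: look the device up by id, the path is ignored. */
	ret = btrfs_find_device_by_devspec(root, 3, NULL, &device);

	/*
	 * devid == 0: fall back to the path-based lookup (which also handles
	 * a missing device); a NULL or empty path is rejected with -EINVAL.
	 */
	ret = btrfs_find_device_by_devspec(root, 0, "/dev/sdf", &device);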
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/volumes.c | 17 ++++++++++------- fs/btrfs/volumes.h | 4 ++-- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index fdd2880707df..b7f5f4aa6e86 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -322,7 +322,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_find_device_by_user_input(root, args->start.srcdevid, + ret = btrfs_find_device_by_devspec(root, args->start.srcdevid, args->start.srcdev_name, &src_device); if (ret) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 055fbebea199..e43b01e438b1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1780,7 +1780,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) if (ret) goto out; - ret = btrfs_find_device_by_user_input(root, devid, device_path, + ret = btrfs_find_device_by_devspec(root, devid, device_path, &device); if (ret) goto out; @@ -2065,23 +2065,26 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } -int btrfs_find_device_by_user_input(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, +/* + * Lookup a device given by device id, or the path if the id is 0. + */ +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, struct btrfs_device **device) { int ret; - if (srcdevid) { + if (devid) { ret = 0; - *device = btrfs_find_device(root->fs_info, srcdevid, NULL, + *device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!*device) ret = -ENOENT; } else { - if (!srcdev_name || !srcdev_name[0]) + if (!devpath || !devpath[0]) return -EINVAL; - ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, + ret = btrfs_find_device_missing_or_by_path(root, devpath, device); } return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a13a538cb01e..febdb7bc9370 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -448,8 +448,8 @@ void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -int btrfs_find_device_by_user_input(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, -- cgit v1.2.3-55-g7522 From 735654ea91a06a30bfe05fdfd09c8895abf6c1bf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 15 Feb 2016 18:15:21 +0100 Subject: btrfs: rename flags for vol args v2 Rename BTRFS_DEVICE_BY_ID so it's more descriptive that we specify the device by id, it'll be part of the public API. The mask of supported flags is also renamed, only for internal use. The error code for unknown flags is EOPNOTSUPP, fixed. 
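For context, a minimal userspace sketch of how the renamed flag is meant to be consumed (illustrative only; it assumes uapi headers that already carry BTRFS_IOC_RM_DEV_V2, the devid member and the renamed flag from this series, and the mount point and device id are examples):

#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

/* Remove the device with the given id from the filesystem mounted at mnt. */
static int rm_device_by_id(const char *mnt, __u64 devid)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int fd, ret;

	fd = open(mnt, O_RDONLY | O_DIRECTORY);
	if (fd < 0)
		return -errno;

	memset(&args, 0, sizeof(args));
	args.flags = BTRFS_DEVICE_SPEC_BY_ID;	/* was BTRFS_DEVICE_BY_ID */
	args.devid = devid;

	/* Flags outside BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED now fail with EOPNOTSUPP. */
	ret = ioctl(fd, BTRFS_IOC_RM_DEV_V2, &args);
	close(fd);
	return ret;
}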
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 8 ++++---- include/uapi/linux/btrfs.h | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7cf0f6328d99..65903ec5e1c0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2687,8 +2687,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } /* Check for compatibility reject unknown flags */ - if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS) - return -ENOTTY; + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) + return -EOPNOTSUPP; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -2697,7 +2697,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } mutex_lock(&root->fs_info->volume_mutex); - if (vol_args->flags & BTRFS_DEVICE_BY_ID) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { ret = btrfs_rm_device(root, NULL, vol_args->devid); } else { vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; @@ -2707,7 +2707,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); if (!ret) { - if (vol_args->flags & BTRFS_DEVICE_BY_ID) + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) btrfs_info(root->fs_info, "device deleted: id %llu", vol_args->devid); else diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 396a4efca775..3975e683af72 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -36,12 +36,13 @@ struct btrfs_ioctl_vol_args { #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) #define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) -#define BTRFS_DEVICE_BY_ID (1ULL << 3) -#define BTRFS_VOL_ARG_V2_FLAGS \ +#define BTRFS_DEVICE_SPEC_BY_ID (1ULL << 3) + +#define BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED \ (BTRFS_SUBVOL_CREATE_ASYNC | \ BTRFS_SUBVOL_RDONLY | \ BTRFS_SUBVOL_QGROUP_INHERIT | \ - BTRFS_DEVICE_BY_ID) + BTRFS_DEVICE_SPEC_BY_ID) #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 -- cgit v1.2.3-55-g7522 From fc23c246d72d21385be115305d1cb85fcc34acad Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 24 Mar 2016 18:48:12 +0800 Subject: btrfs: use fs_info directly Local variable fs_info, contains root->fs_info, use it. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index b7f5f4aa6e86..1a3d24592859 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -368,7 +368,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(root->fs_info, + btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", src_device->missing ? 
"" : rcu_str_deref(src_device->name), @@ -394,9 +394,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root, ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_err(fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -414,7 +414,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); - ret = btrfs_dev_replace_finishing(root->fs_info, ret); + ret = btrfs_dev_replace_finishing(fs_info, ret); /* don't warn if EINPROGRESS, someone else might be running scrub */ if (ret == -EINPROGRESS) { args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; -- cgit v1.2.3-55-g7522 From b5255456c529155730c837f8cfcea47e8feb85ca Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 24 Mar 2016 18:48:14 +0800 Subject: btrfs: refactor btrfs_dev_replace_start for reuse A refactor patch, and avoids user input verification in the btrfs_dev_replace_start(), and so this function can be reused. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 58 ++++++++++++++++++++++++++++++++------------------ fs/btrfs/dev-replace.h | 4 +++- fs/btrfs/ioctl.c | 2 +- 3 files changed, 41 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1a3d24592859..5aebedf12b5f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -302,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_root *root, - struct btrfs_ioctl_dev_replace_args *args) +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src) { struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -312,25 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - switch (args->start.cont_reading_from_srcdev_mode) { - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: - break; - default: - return -EINVAL; - } - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_find_device_by_devspec(root, args->start.srcdevid, - args->start.srcdev_name, - &src_device); + ret = btrfs_find_device_by_devspec(root, srcdevid, + srcdev_name, &src_device); if (ret) { mutex_unlock(&fs_info->volume_mutex); return ret; } - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name, src_device, &tgt_device); mutex_unlock(&fs_info->volume_mutex); if (ret) @@ -357,12 +348,11 @@ int btrfs_dev_replace_start(struct btrfs_root *root, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } - dev_replace->cont_reading_from_srcdev_mode = - args->start.cont_reading_from_srcdev_mode; + dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; WARN_ON(!tgt_device); 
@@ -389,7 +379,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); @@ -415,10 +404,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - /* don't warn if EINPROGRESS, someone else might be running scrub */ if (ret == -EINPROGRESS) { - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - ret = 0; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; } else { WARN_ON(ret); } @@ -433,6 +420,35 @@ leave: return ret; } +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + int ret; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + ret = btrfs_dev_replace_start(root, args->start.tgtdev_name, + args->start.srcdevid, + args->start.srcdev_name, + args->start.cont_reading_from_srcdev_mode); + args->result = ret; + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) + ret = 0; + + return ret; +} + /* * blocked until all flighting bios are finished. */ diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 29e3ef5f96bd..e922b42d91df 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_start(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 65903ec5e1c0..36b1ed223509 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4430,7 +4430,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { - ret = btrfs_dev_replace_start(root, p); + ret = btrfs_dev_replace_by_ioctl(root, p); atomic_set( &root->fs_info->mutually_exclusive_operation_running, 0); -- cgit v1.2.3-55-g7522 From d4ae133b2d195d88cf5394072724adfa6ccdd64b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:23 -0400 Subject: btrfs: uapi/linux/btrfs.h migration, move BTRFS_LABEL_SIZE BTRFS_LABEL_SIZE is required to define the BTRFS_IOC_GET_FSLABEL and BTRFS_IOC_SET_FSLABEL ioctls. 
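As an illustration of why the constant has to live in the uapi header, a userspace sketch (not part of this patch; error handling is minimal):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Print the filesystem label; the ioctl expects a BTRFS_LABEL_SIZE buffer. */
static int print_fs_label(int fd)
{
	char label[BTRFS_LABEL_SIZE] = { 0 };

	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) < 0)
		return -1;
	printf("label: %s\n", label);
	return 0;
}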
Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - include/uapi/linux/btrfs.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..3beaa24aff9b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -410,7 +410,6 @@ struct btrfs_header { * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 -#define BTRFS_LABEL_SIZE 256 /* * just in case we somehow lose the roots and are not able to mount, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index dea893199257..11eee3439994 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -23,6 +23,7 @@ #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_LABEL_SIZE 256 /* this should be 4k */ #define BTRFS_PATH_NAME_MAX 4087 -- cgit v1.2.3-55-g7522 From 83288b60bf6668933689078973136e0c9d387b38 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:24 -0400 Subject: btrfs: uapi/linux/btrfs.h migration, qgroup limit flags The BTRFS_QGROUP_LIMIT_* flags are required to tell the kernel which fields are valid when using the BTRFS_IOC_QGROUP_LIMIT ioctl. Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 -------- include/uapi/linux/btrfs.h | 22 +++++++++++++++++++++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3beaa24aff9b..c228b39fbd15 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1154,14 +1154,6 @@ struct btrfs_qgroup_info_item { __le64 excl_cmpr; } __attribute__ ((__packed__)); -/* flags definition for qgroup limits */ -#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) -#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) -#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) -#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) - struct btrfs_qgroup_limit_item { /* * only updated when any of the other values change diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 11eee3439994..9651af3c46a1 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -41,7 +41,19 @@ struct btrfs_ioctl_vol_args { #define BTRFS_UUID_SIZE 16 #define BTRFS_UUID_UNPARSED_SIZE 37 -#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) +/* + * flags definition for qgroup limits + * + * Used by: + * struct btrfs_qgroup_limit.flags + * struct btrfs_qgroup_limit_item.flags + */ +#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) +#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) +#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) +#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) +#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) +#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) struct btrfs_qgroup_limit { __u64 flags; @@ -51,6 +63,14 @@ struct btrfs_qgroup_limit { __u64 rsv_excl; }; +/* + * flags definition for qgroup inheritance + * + * Used by: + * struct btrfs_qgroup_inherit.flags + */ +#define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) + struct btrfs_qgroup_inherit { __u64 flags; __u64 num_qgroups; -- cgit v1.2.3-55-g7522 From 884f6eca59475bc3cad5c22360523a1261bd5597 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:25 -0400 Subject: btrfs: uapi/linux/btrfs.h migration, document subvol flags Signed-off-by: Jeff Mahoney Reviewed-by: Liu 
Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9651af3c46a1..0316e23b32cc 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -34,9 +34,6 @@ struct btrfs_ioctl_vol_args { #define BTRFS_DEVICE_PATH_NAME_MAX 1024 -#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) -#define BTRFS_SUBVOL_RDONLY (1ULL << 1) -#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) #define BTRFS_FSID_SIZE 16 #define BTRFS_UUID_SIZE 16 #define BTRFS_UUID_UNPARSED_SIZE 37 @@ -85,6 +82,20 @@ struct btrfs_ioctl_qgroup_limit_args { struct btrfs_qgroup_limit lim; }; +/* + * flags for subvolumes + * + * Used by: + * struct btrfs_ioctl_vol_args_v2.flags + * + * BTRFS_SUBVOL_RDONLY is also provided/consumed by the following ioctls: + * - BTRFS_IOC_SUBVOL_GETFLAGS + * - BTRFS_IOC_SUBVOL_SETFLAGS + */ +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) +#define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) + #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { __s64 fd; -- cgit v1.2.3-55-g7522 From 18db9ac644badcb948a623791e599672edade6dd Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:26 -0400 Subject: btrfs: uapi/linux/btrfs.h migration, move feature flags The compat/compat_ro/incompat feature flags are used by the feature set/get ioctls. Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 25 ------------------------- include/uapi/linux/btrfs.h | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c228b39fbd15..378482c705c3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -506,31 +506,6 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) - -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. 
- */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) - -#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) -#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) -#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) -#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) - #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 0316e23b32cc..de98717386ee 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -222,6 +222,37 @@ struct btrfs_ioctl_fs_info_args { __u64 reserved[122]; /* pad to 1k */ }; +/* + * feature flags + * + * Used by: + * struct btrfs_ioctl_feature_flags + */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) + +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) +/* + * some patches floated around with a second compression method + * lets save that incompat here for when they do get in + * Note we don't actually support it, we're just reserving the + * number + */ +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) + +/* + * older kernels tried to do bigger metadata blocks, but the + * code was pretty buggy. Lets not let them try anymore. + */ +#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) + +#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) +#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) +#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) +#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) + struct btrfs_ioctl_feature_flags { __u64 compat_flags; __u64 compat_ro_flags; -- cgit v1.2.3-55-g7522 From 04cd01dffbf9be14ccc51595280c8dc8c318a9c9 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:27 -0400 Subject: btrfs: uapi/linux/btrfs.h migration, move balance flags The BTRFS_BALANCE_* flags are used by struct btrfs_ioctl_balance_args.flags and btrfs_ioctl_balance_args.{data,meta,sys}.flags in the BTRFS_IOC_BALANCE ioctl. 
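For orientation, a sketch of how the two flag namespaces are filled from userspace before the struct is handed to the balance ioctl (illustrative only; the usage threshold is an example):

#include <string.h>
#include <linux/btrfs.h>

/* Balance only data chunks, and only those less than half full. */
static void fill_balance_request(struct btrfs_ioctl_balance_args *ba)
{
	memset(ba, 0, sizeof(*ba));
	ba->flags = BTRFS_BALANCE_DATA;			/* general type filter */
	ba->data.flags = BTRFS_BALANCE_ARGS_USAGE;	/* per-type filter */
	ba->data.usage = 50;				/* percent threshold */
}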
Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 46 --------------------------------- include/uapi/linux/btrfs.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..144cec33357a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -357,52 +357,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) -#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) -#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) -#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) -#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) - -#define BTRFS_BALANCE_ARGS_MASK \ - (BTRFS_BALANCE_ARGS_PROFILES | \ - BTRFS_BALANCE_ARGS_USAGE | \ - BTRFS_BALANCE_ARGS_DEVID | \ - BTRFS_BALANCE_ARGS_DRANGE | \ - BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT | \ - BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ - BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ - BTRFS_BALANCE_ARGS_USAGE_RANGE) - -/* - * Profile changing flags. When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). 
- */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index de98717386ee..abae362d4ec7 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -317,6 +317,70 @@ struct btrfs_balance_progress { __u64 completed; /* # of chunks relocated so far */ }; +/* + * flags definition for balance + * + * Restriper's general type filter + * + * Used by: + * btrfs_ioctl_balance_args.flags + * btrfs_balance_control.flags (internal) + */ +#define BTRFS_BALANCE_DATA (1ULL << 0) +#define BTRFS_BALANCE_SYSTEM (1ULL << 1) +#define BTRFS_BALANCE_METADATA (1ULL << 2) + +#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ + BTRFS_BALANCE_SYSTEM | \ + BTRFS_BALANCE_METADATA) + +#define BTRFS_BALANCE_FORCE (1ULL << 3) +#define BTRFS_BALANCE_RESUME (1ULL << 4) + +/* + * flags definitions for per-type balance args + * + * Balance filters + * + * Used by: + * struct btrfs_balance_args + */ +#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) +#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) +#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) +#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) +#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) +#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) + +#define BTRFS_BALANCE_ARGS_MASK \ + (BTRFS_BALANCE_ARGS_PROFILES | \ + BTRFS_BALANCE_ARGS_USAGE | \ + BTRFS_BALANCE_ARGS_DEVID | \ + BTRFS_BALANCE_ARGS_DRANGE | \ + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) + +/* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be + * half-filled). + */ +#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) +#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) + + +/* + * flags definition for balance state + * + * Used by: + * struct btrfs_ioctl_balance_args.state + */ #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) #define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) -- cgit v1.2.3-55-g7522 From 33ca913349962208e13e894ada99b9ae6e0080ee Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:28 -0400 Subject: btrfs: uapi/linux/btrfs.h migration, move struct btrfs_ioctl_defrag_range_args struct btrfs_ioctl_defrag_range_args is used by the BTRFS_IOC_DEFRAG_RANGE ioctl. 
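A minimal userspace sketch of that usage (illustrative only; fd is assumed to be open on a file on btrfs):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

/* Defragment the whole file behind fd and request compression. */
static int defrag_whole_file(int fd)
{
	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;			/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	range.compress_type = 0;		/* 0 = kernel default (zlib) */
	return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
}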
Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 31 ------------------------------- include/uapi/linux/btrfs.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 378482c705c3..89f36b6176b9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1992,37 +1992,6 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* - * which compression method to use if turning on compression - * for this defrag operation. If unspecified, zlib will - * be used - */ - __u32 compress_type; - - /* spare for later */ - __u32 unused[4]; -}; - /* * inode items have the data typically returned from stat and store other diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index abae362d4ec7..98aff38a70d3 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -474,9 +474,45 @@ struct btrfs_ioctl_clone_range_args { __u64 dest_offset; }; -/* flags for the defrag range ioctl */ +/* + * flags definition for the defrag range ioctl + * + * Used by: + * struct btrfs_ioctl_defrag_range_args.flags + */ #define BTRFS_DEFRAG_RANGE_COMPRESS 1 #define BTRFS_DEFRAG_RANGE_START_IO 2 +struct btrfs_ioctl_defrag_range_args { + /* start of the defrag operation */ + __u64 start; + + /* number of bytes to defrag, use (u64)-1 to say all */ + __u64 len; + + /* + * flags for the operation, which can include turning + * on compression for this one defrag + */ + __u64 flags; + + /* + * any extent bigger than this will be considered + * already defragged. Use 0 to take the kernel default + * Use 1 to say every single extent must be rewritten + */ + __u32 extent_thresh; + + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + + /* spare for later */ + __u32 unused[4]; +}; + #define BTRFS_SAME_DATA_DIFFERS 1 /* For extent-same ioctl */ -- cgit v1.2.3-55-g7522 From db6711600e27c885aed89751f04e727f3af26715 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:29 -0400 Subject: btrfs: uapi/linux/btrfs_tree.h migration, item types and defines The BTRFS_IOC_SEARCH_TREE ioctl returns file system items directly to userspace. In order to decode them, full type information is required. Create a new header, btrfs_tree to contain these since most users won't need them. 
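A sketch of the kind of consumer the new header is aimed at (illustrative only; it assumes BTRFS_ROOT_TREE_OBJECTID and BTRFS_ROOT_ITEM_KEY end up in linux/btrfs_tree.h as part of this migration):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>
#include <linux/btrfs_tree.h>

/*
 * Count ROOT_ITEMs in the root tree; decoding tree-search results needs
 * the objectid and key-type definitions moved into btrfs_tree.h.
 */
static int count_root_items(int fd)
{
	struct btrfs_ioctl_search_args args;
	struct btrfs_ioctl_search_key *sk = &args.key;

	memset(&args, 0, sizeof(args));
	sk->tree_id = BTRFS_ROOT_TREE_OBJECTID;
	sk->min_type = BTRFS_ROOT_ITEM_KEY;
	sk->max_type = BTRFS_ROOT_ITEM_KEY;
	sk->max_objectid = (__u64)-1;
	sk->max_offset = (__u64)-1;
	sk->max_transid = (__u64)-1;
	sk->nr_items = 4096;

	if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0)
		return -1;
	return sk->nr_items;	/* kernel writes back the number of items found */
}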
Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 949 +-------------------------------------- include/uapi/linux/btrfs_tree.h | 966 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 967 insertions(+), 948 deletions(-) create mode 100644 include/uapi/linux/btrfs_tree.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 89f36b6176b9..cf34fb58874c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -64,98 +65,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* holds pointers to all of the tree roots */ -#define BTRFS_ROOT_TREE_OBJECTID 1ULL - -/* stores information about which extents are in use, and reference counts */ -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL - -/* - * chunk tree stores translations from logical -> physical block numbering - * the super block points to the chunk tree - */ -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL - -/* - * stores information about which areas of a given device are in use. - * one per device. The tree of tree roots points to the device tree - */ -#define BTRFS_DEV_TREE_OBJECTID 4ULL - -/* one per subvolume, storing files and directories */ -#define BTRFS_FS_TREE_OBJECTID 5ULL - -/* directory objectid inside the root tree */ -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL - -/* holds checksums of all the data extents */ -#define BTRFS_CSUM_TREE_OBJECTID 7ULL - -/* holds quota configuration and tracking */ -#define BTRFS_QUOTA_TREE_OBJECTID 8ULL - -/* for storing items that use the BTRFS_UUID_KEY* types */ -#define BTRFS_UUID_TREE_OBJECTID 9ULL - -/* tracks free space in block groups. */ -#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL - -/* device stats in the device tree */ -#define BTRFS_DEV_STATS_OBJECTID 0ULL - -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - -/* orhpan objectid for tracking unlinked/truncated files */ -#define BTRFS_ORPHAN_OBJECTID -5ULL - -/* does write ahead logging to speed up fsyncs */ -#define BTRFS_TREE_LOG_OBJECTID -6ULL -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL - -/* for space balancing */ -#define BTRFS_TREE_RELOC_OBJECTID -8ULL -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL - -/* - * extent checksums all have this objectid - * this allows them to share the logging tree - * for fsyncs - */ -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL - -/* For storing free space cache */ -#define BTRFS_FREE_SPACE_OBJECTID -11ULL - -/* - * The inode number assigned to the special inode for storing - * free ino cache - */ -#define BTRFS_FREE_INO_OBJECTID -12ULL - -/* dummy objectid represents multiple objectids */ -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL - -/* - * All files have objectids in this range. - */ -#define BTRFS_FIRST_FREE_OBJECTID 256ULL -#define BTRFS_LAST_FREE_OBJECTID -256ULL -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL - - -/* - * the device items go into the chunk tree. The key is in the form - * [ 1 BTRFS_DEV_ITEM_KEY device_id ] - */ -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL - -#define BTRFS_BTREE_INODE_OBJECTID 1 - -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 - -#define BTRFS_DEV_REPLACE_DEVID 0ULL - /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. 
@@ -175,12 +84,6 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U -/* 32 bytes in various csum fields */ -#define BTRFS_CSUM_SIZE 32 - -/* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 - static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ @@ -189,17 +92,6 @@ static const int btrfs_csum_sizes[] = { 4 }; /* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) -#define BTRFS_FT_UNKNOWN 0 -#define BTRFS_FT_REG_FILE 1 -#define BTRFS_FT_DIR 2 -#define BTRFS_FT_CHRDEV 3 -#define BTRFS_FT_BLKDEV 4 -#define BTRFS_FT_FIFO 5 -#define BTRFS_FT_SOCK 6 -#define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_XATTR 8 -#define BTRFS_FT_MAX 9 - /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) @@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * The key defines the order in the tree, and so it also defines (optimal) - * block layout. - * - * objectid corresponds to the inode number. - * - * type tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with type of 1 might refer to the inode data, - * type of 2 may point to file data in the btree and type == 3 may point to - * extents. - * - * offset is the starting byte offset for this key in the stream. - * - * btrfs_disk_key is in disk byte order. struct btrfs_key is always - * in cpu native order. Otherwise they are identical and their sizes - * should be the same (ie both packed) - */ -struct btrfs_disk_key { - __le64 objectid; - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; -} __attribute__ ((__packed__)); - struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -struct btrfs_dev_item { - /* the internal btrfs device id */ - __le64 devid; - - /* size of the device */ - __le64 total_bytes; - - /* bytes used */ - __le64 bytes_used; - - /* optimal io alignment for this device */ - __le32 io_align; - - /* optimal io width for this device */ - __le32 io_width; - - /* minimal io size for this device */ - __le32 sector_size; - - /* type and info about this device */ - __le64 type; - - /* expected generation for this device */ - __le64 generation; - - /* - * starting byte of this partition on the device, - * to allow for stripe alignment in the future - */ - __le64 start_offset; - - /* grouping information for allocation decisions */ - __le32 dev_group; - - /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; - - /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; - - /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_stripe { - __le64 devid; - __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_chunk { - /* size of this chunk in bytes */ - __le64 length; - - /* objectid of the root referencing this chunk */ - __le64 owner; - - __le64 stripe_len; - __le64 type; - - /* optimal io alignment for this chunk */ - __le32 io_align; - - /* optimal io width for this chunk */ - __le32 io_width; - - /* minimal io size for this chunk */ - __le32 sector_size; - - /* 2^16 stripes is quite a lot, a second limit is the size of a single - * item in the btree - */ - __le16 num_stripes; - - /* sub stripes only matter for raid10 */ - __le16 sub_stripes; - 
struct btrfs_stripe stripe; - /* additional stripes go here */ -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_EXTENT 1 -#define BTRFS_FREE_SPACE_BITMAP 2 - -struct btrfs_free_space_entry { - __le64 offset; - __le64 bytes; - u8 type; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_header { - struct btrfs_disk_key location; - __le64 generation; - __le64 num_entries; - __le64 num_bitmaps; -} __attribute__ ((__packed__)); - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) -#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) - /* * File system states */ @@ -357,13 +118,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 -/* Super block flags */ -/* Errors detected */ -#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) - -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) -#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) - #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ @@ -598,357 +352,8 @@ struct btrfs_path { unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; - -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ - -struct btrfs_extent_item { - __le64 refs; - __le64 generation; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_extent_item_v0 { - __le32 refs; -} __attribute__ ((__packed__)); - #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ sizeof(struct btrfs_item)) - -#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) -#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) - -/* following flags only apply to tree blocks */ - -/* use full backrefs for extent pointers in the block */ -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) - -/* - * this flag is only used internally by scrub and may be changed at any time - * it is only declared here to avoid collisions - */ -#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) - -struct btrfs_tree_block_info { - struct btrfs_disk_key key; - u8 level; -} __attribute__ ((__packed__)); - -struct btrfs_extent_data_ref { - __le64 root; - __le64 objectid; - __le64 offset; - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_shared_data_ref { - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_extent_inline_ref { - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - -/* dev extents record free space on individual devices. The owner - * field points back to the chunk allocation mapping tree that allocated - * the extent. 
The chunk tree uuid field is a way to double check the owner - */ -struct btrfs_dev_extent { - __le64 chunk_tree; - __le64 chunk_objectid; - __le64 chunk_offset; - __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_inode_ref { - __le64 index; - __le16 name_len; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_inode_extref { - __le64 parent_objectid; - __le64 index; - __le16 name_len; - __u8 name[0]; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_timespec { - __le64 sec; - __le32 nsec; -} __attribute__ ((__packed__)); - -struct btrfs_inode_item { - /* nfs style generation number */ - __le64 generation; - /* transid that last touched this inode */ - __le64 transid; - __le64 size; - __le64 nbytes; - __le64 block_group; - __le32 nlink; - __le32 uid; - __le32 gid; - __le32 mode; - __le64 rdev; - __le64 flags; - - /* modification sequence number for NFS */ - __le64 sequence; - - /* - * a little future expansion, for more than this we can - * just grow the inode item and version it - */ - __le64 reserved[4]; - struct btrfs_timespec atime; - struct btrfs_timespec ctime; - struct btrfs_timespec mtime; - struct btrfs_timespec otime; -} __attribute__ ((__packed__)); - -struct btrfs_dir_log_item { - __le64 end; -} __attribute__ ((__packed__)); - -struct btrfs_dir_item { - struct btrfs_disk_key location; - __le64 transid; - __le16 data_len; - __le16 name_len; - u8 type; -} __attribute__ ((__packed__)); - -#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) - -/* - * Internal in-memory flag that a subvolume has been marked for deletion but - * still visible as a directory - */ -#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) - -struct btrfs_root_item { - struct btrfs_inode_item inode; - __le64 generation; - __le64 root_dirid; - __le64 bytenr; - __le64 byte_limit; - __le64 bytes_used; - __le64 last_snapshot; - __le64 flags; - __le32 refs; - struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; - - /* - * The following fields appear after subvol_uuids+subvol_times - * were introduced. - */ - - /* - * This generation number is used to test if the new fields are valid - * and up to date while reading the root item. Every time the root item - * is written out, the "generation" field is copied into this field. If - * anyone ever mounted the fs with an older kernel, we will have - * mismatching generation values here and thus must invalidate the - * new fields. See btrfs_update_root and btrfs_find_last_root for - * details. - * the offset of generation_v2 is also used as the start for the memset - * when invalidating the fields. - */ - __le64 generation_v2; - u8 uuid[BTRFS_UUID_SIZE]; - u8 parent_uuid[BTRFS_UUID_SIZE]; - u8 received_uuid[BTRFS_UUID_SIZE]; - __le64 ctransid; /* updated when an inode changes */ - __le64 otransid; /* trans when created */ - __le64 stransid; /* trans when sent. non-zero for received subvol */ - __le64 rtransid; /* trans when received. 
non-zero for received subvol */ - struct btrfs_timespec ctime; - struct btrfs_timespec otime; - struct btrfs_timespec stime; - struct btrfs_timespec rtime; - __le64 reserved[8]; /* for future */ -} __attribute__ ((__packed__)); - -/* - * this is used for both forward and backward root refs - */ -struct btrfs_root_ref { - __le64 dirid; - __le64 sequence; - __le16 name_len; -} __attribute__ ((__packed__)); - -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* - * usage filter - * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' - * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max - */ - union { - __le64 usage; - struct { - __le32 usage_min; - __le32 usage_max; - }; - }; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - /* - * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' - * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum - * and maximum - */ - union { - __le64 limit; - struct { - __le32 limit_min; - __le32 limit_max; - }; - }; - - /* - * Process chunks that cross stripes_min..stripes_max devices, - * BTRFS_BALANCE_ARGS_STRIPES_RANGE - */ - __le32 stripes_min; - __le32 stripes_max; - - __le64 unused[6]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { - /* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 - -struct btrfs_file_extent_item { - /* - * transaction id that created this extent - */ - __le64 generation; - /* - * max number of bytes to hold this extent in ram - * when we split a compressed extent we can't know how big - * each of the resulting pieces will be. So, this is - * an upper limit on the size of the extent in ram instead of - * an exact limit. - */ - __le64 ram_bytes; - - /* - * 32 bits for the various ways we might encode the data, - * including compression and encryption. If any of these - * are set to something a given disk format doesn't understand - * it is treated like an incompat flag for reading and writing, - * but not for stat. - */ - u8 compression; - u8 encryption; - __le16 other_encoding; /* spare for later use */ - - /* are we inline data or a real extent? */ - u8 type; - - /* - * disk space consumed by the extent, checksum blocks are included - * in these numbers - * - * At this offset in the structure, the inline extent data start. - */ - __le64 disk_bytenr; - __le64 disk_num_bytes; - /* - * the logical offset in file blocks (no csums) - * this extent record is for. This allows a file extent to point - * into the middle of an existing extent on disk, sharing it - * between two snapshots (useful if some bytes in the middle of the - * extent have changed - */ - __le64 offset; - /* - * the logical number of file blocks (no csums included). This - * always reflects the size uncompressed and without encoding. 
- */ - __le64 num_bytes; - -} __attribute__ ((__packed__)); - -struct btrfs_csum_item { - u8 csum; -} __attribute__ ((__packed__)); - -struct btrfs_dev_stats_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; -} __attribute__ ((__packed__)); - -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - struct btrfs_dev_replace { u64 replace_state; /* see #define above */ u64 time_started; /* seconds since 1-Jan-1970 */ @@ -979,167 +384,6 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; -struct btrfs_dev_replace_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 src_devid; - __le64 cursor_left; - __le64 cursor_right; - __le64 cont_reading_from_srcdev_mode; - - __le64 replace_state; - __le64 time_started; - __le64 time_stopped; - __le64 num_write_errors; - __le64 num_uncorrectable_read_errors; -} __attribute__ ((__packed__)); - -/* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ - BTRFS_SPACE_INFO_GLOBAL_RSV) - -enum btrfs_raid_types { - BTRFS_RAID_RAID10, - BTRFS_RAID_RAID1, - BTRFS_RAID_DUP, - BTRFS_RAID_RAID0, - BTRFS_RAID_SINGLE, - BTRFS_RAID_RAID5, - BTRFS_RAID_RAID6, - BTRFS_NR_RAID_TYPES -}; - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6) - -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) - -/* - * A fake block group type that is used to communicate global block reserve - * size to userspace via the SPACE_INFO ioctl. 
- */ -#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) - -#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ - BTRFS_AVAIL_ALLOC_BIT_SINGLE) - -static inline u64 chunk_to_extended(u64 flags) -{ - if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) - flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - return flags; -} -static inline u64 extended_to_chunk(u64 flags) -{ - return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; -} - -struct btrfs_block_group_item { - __le64 used; - __le64 chunk_objectid; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_info { - __le32 extent_count; - __le32 flags; -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) - -#define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline u64 btrfs_qgroup_level(u64 qgroupid) -{ - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; -} - -/* - * is subvolume quota turned on? - */ -#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) -/* - * RESCAN is set during the initialization phase - */ -#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) -/* - * Some qgroup entries are known to be out of date, - * either because the configuration has changed in a way that - * makes a rescan necessary, or because the fs has been mounted - * with a non-qgroup-aware version. - * Turning qouta off and on again makes it inconsistent, too. - */ -#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) - -#define BTRFS_QGROUP_STATUS_VERSION 1 - -struct btrfs_qgroup_status_item { - __le64 version; - /* - * the generation is updated during every commit. As older - * versions of btrfs are not aware of qgroups, it will be - * possible to detect inconsistencies by checking the - * generation on mount time - */ - __le64 generation; - - /* flag definitions see above */ - __le64 flags; - - /* - * only used during scanning to record the progress - * of the scan. It contains a logical address - */ - __le64 rescan; -} __attribute__ ((__packed__)); - -struct btrfs_qgroup_info_item { - __le64 generation; - __le64 rfer; - __le64 rfer_cmpr; - __le64 excl; - __le64 excl_cmpr; -} __attribute__ ((__packed__)); - -struct btrfs_qgroup_limit_item { - /* - * only updated when any of the other values change - */ - __le64 flags; - __le64 max_rfer; - __le64 max_excl; - __le64 rsv_rfer; - __le64 rsv_excl; -} __attribute__ ((__packed__)); - /* For raid type sysfs entries */ struct raid_kobject { int raid_type; @@ -1992,197 +1236,6 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; - -/* - * inode items have the data typically returned from stat and store other - * info about object characteristics. There is one for every file and dir in - * the FS - */ -#define BTRFS_INODE_ITEM_KEY 1 -#define BTRFS_INODE_REF_KEY 12 -#define BTRFS_INODE_EXTREF_KEY 13 -#define BTRFS_XATTR_ITEM_KEY 24 -#define BTRFS_ORPHAN_ITEM_KEY 48 -/* reserve 2-15 close to the inode for later flexibility */ - -/* - * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. - */ -#define BTRFS_DIR_LOG_ITEM_KEY 60 -#define BTRFS_DIR_LOG_INDEX_KEY 72 -#define BTRFS_DIR_ITEM_KEY 84 -#define BTRFS_DIR_INDEX_KEY 96 -/* - * extent data is for file data - */ -#define BTRFS_EXTENT_DATA_KEY 108 - -/* - * extent csums are stored in a separate tree and hold csums for - * an entire extent on disk. - */ -#define BTRFS_EXTENT_CSUM_KEY 128 - -/* - * root items point to tree roots. 
They are typically in the root - * tree used by the super block to find all the other trees - */ -#define BTRFS_ROOT_ITEM_KEY 132 - -/* - * root backrefs tie subvols and snapshots to the directory entries that - * reference them - */ -#define BTRFS_ROOT_BACKREF_KEY 144 - -/* - * root refs make a fast index for listing all of the snapshots and - * subvolumes referenced by a given root. They point directly to the - * directory item in the root that references the subvol - */ -#define BTRFS_ROOT_REF_KEY 156 - -/* - * extent items are in the extent map tree. These record which blocks - * are used, and how many references there are to each block - */ -#define BTRFS_EXTENT_ITEM_KEY 168 - -/* - * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know - * the length, so we save the level in key->offset instead of the length. - */ -#define BTRFS_METADATA_ITEM_KEY 169 - -#define BTRFS_TREE_BLOCK_REF_KEY 176 - -#define BTRFS_EXTENT_DATA_REF_KEY 178 - -#define BTRFS_EXTENT_REF_V0_KEY 180 - -#define BTRFS_SHARED_BLOCK_REF_KEY 182 - -#define BTRFS_SHARED_DATA_REF_KEY 184 - -/* - * block groups give us hints into the extent allocation trees. Which - * blocks are free etc etc - */ -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 - -/* - * Every block group is represented in the free space tree by a free space info - * item, which stores some accounting information. It is keyed on - * (block_group_start, FREE_SPACE_INFO, block_group_length). - */ -#define BTRFS_FREE_SPACE_INFO_KEY 198 - -/* - * A free space extent tracks an extent of space that is free in a block group. - * It is keyed on (start, FREE_SPACE_EXTENT, length). - */ -#define BTRFS_FREE_SPACE_EXTENT_KEY 199 - -/* - * When a block group becomes very fragmented, we convert it to use bitmaps - * instead of extents. A free space bitmap is keyed on - * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with - * (length / sectorsize) bits. - */ -#define BTRFS_FREE_SPACE_BITMAP_KEY 200 - -#define BTRFS_DEV_EXTENT_KEY 204 -#define BTRFS_DEV_ITEM_KEY 216 -#define BTRFS_CHUNK_ITEM_KEY 228 - -/* - * Records the overall state of the qgroups. - * There's only one instance of this key present, - * (0, BTRFS_QGROUP_STATUS_KEY, 0) - */ -#define BTRFS_QGROUP_STATUS_KEY 240 -/* - * Records the currently used space of the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). - */ -#define BTRFS_QGROUP_INFO_KEY 242 -/* - * Contains the user configured limits for the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). - */ -#define BTRFS_QGROUP_LIMIT_KEY 244 -/* - * Records the child-parent relationship of qgroups. For - * each relation, 2 keys are present: - * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) - * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) - */ -#define BTRFS_QGROUP_RELATION_KEY 246 - -/* - * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. - */ -#define BTRFS_BALANCE_ITEM_KEY 248 - -/* - * The key type for tree items that are stored persistently, but do not need to - * exist for extended period of time. The items can exist in any tree. - * - * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] - * - * Existing items: - * - * - balance status item - * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) - */ -#define BTRFS_TEMPORARY_ITEM_KEY 248 - -/* - * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY - */ -#define BTRFS_DEV_STATS_KEY 249 - -/* - * The key type for tree items that are stored persistently and usually exist - * for a long period, eg. filesystem lifetime. 
The item kinds can be status - * information, stats or preference values. The item can exist in any tree. - * - * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] - * - * Existing items: - * - * - device statistics, store IO stats in the device tree, one key for all - * stats - * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) - */ -#define BTRFS_PERSISTENT_ITEM_KEY 249 - -/* - * Persistantly stores the device replace state in the device tree. - * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). - */ -#define BTRFS_DEV_REPLACE_KEY 250 - -/* - * Stores items that allow to quickly map UUIDs to something else. - * These items are part of the filesystem UUID tree. - * The key is built like this: - * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). - */ -#if BTRFS_UUID_SIZE != 16 -#error "UUID items require BTRFS_UUID_SIZE == 16!" -#endif -#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ -#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to - * received subvols */ - -/* - * string items are for debugging. They just store a short string of - * data in the FS - */ -#define BTRFS_STRING_ITEM_KEY 253 - /* * Flags for mount options. * diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h new file mode 100644 index 000000000000..1e875057d6b2 --- /dev/null +++ b/include/uapi/linux/btrfs_tree.h @@ -0,0 +1,966 @@ +#ifndef _BTRFS_CTREE_H_ +#define _BTRFS_CTREE_H_ + +/* + * This header contains the structure definitions and constants used + * by file system objects that can be retrieved using + * the BTRFS_IOC_SEARCH_TREE ioctl. That means basically anything that + * is needed to describe a leaf node's key or item contents. + */ + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* holds quota configuration and tracking */ +#define BTRFS_QUOTA_TREE_OBJECTID 8ULL + +/* for storing items that use the BTRFS_UUID_KEY* types */ +#define BTRFS_UUID_TREE_OBJECTID 9ULL + +/* tracks free space in block groups. 
*/ +#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + +/* device stats in the device tree */ +#define BTRFS_DEV_STATS_OBJECTID 0ULL + +/* for storing balance parameters in the root tree */ +#define BTRFS_BALANCE_OBJECTID -4ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + +/* + * The inode number assigned to the special inode for storing + * free ino cache + */ +#define BTRFS_FREE_INO_OBJECTID -12ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + +#define BTRFS_DEV_REPLACE_DEVID 0ULL + +/* + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_INODE_EXTREF_KEY 13 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. They are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 + +/* + * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know + * the length, so we save the level in key->offset instead of the length. 
+ */ +#define BTRFS_METADATA_ITEM_KEY 169 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +/* + * Every block group is represented in the free space tree by a free space info + * item, which stores some accounting information. It is keyed on + * (block_group_start, FREE_SPACE_INFO, block_group_length). + */ +#define BTRFS_FREE_SPACE_INFO_KEY 198 + +/* + * A free space extent tracks an extent of space that is free in a block group. + * It is keyed on (start, FREE_SPACE_EXTENT, length). + */ +#define BTRFS_FREE_SPACE_EXTENT_KEY 199 + +/* + * When a block group becomes very fragmented, we convert it to use bitmaps + * instead of extents. A free space bitmap is keyed on + * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with + * (length / sectorsize) bits. + */ +#define BTRFS_FREE_SPACE_BITMAP_KEY 200 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * Records the overall state of the qgroups. + * There's only one instance of this key present, + * (0, BTRFS_QGROUP_STATUS_KEY, 0) + */ +#define BTRFS_QGROUP_STATUS_KEY 240 +/* + * Records the currently used space of the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). + */ +#define BTRFS_QGROUP_INFO_KEY 242 +/* + * Contains the user configured limits for the qgroup. + * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). + */ +#define BTRFS_QGROUP_LIMIT_KEY 244 +/* + * Records the child-parent relationship of qgroups. For + * each relation, 2 keys are present: + * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) + * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) + */ +#define BTRFS_QGROUP_RELATION_KEY 246 + +/* + * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. + */ +#define BTRFS_BALANCE_ITEM_KEY 248 + +/* + * The key type for tree items that are stored persistently, but do not need to + * exist for extended period of time. The items can exist in any tree. + * + * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] + * + * Existing items: + * + * - balance status item + * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) + */ +#define BTRFS_TEMPORARY_ITEM_KEY 248 + +/* + * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY + */ +#define BTRFS_DEV_STATS_KEY 249 + +/* + * The key type for tree items that are stored persistently and usually exist + * for a long period, eg. filesystem lifetime. The item kinds can be status + * information, stats or preference values. The item can exist in any tree. + * + * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] + * + * Existing items: + * + * - device statistics, store IO stats in the device tree, one key for all + * stats + * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) + */ +#define BTRFS_PERSISTENT_ITEM_KEY 249 + +/* + * Persistantly stores the device replace state in the device tree. + * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). + */ +#define BTRFS_DEV_REPLACE_KEY 250 + +/* + * Stores items that allow to quickly map UUIDs to something else. + * These items are part of the filesystem UUID tree. + * The key is built like this: + * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). + */ +#if BTRFS_UUID_SIZE != 16 +#error "UUID items require BTRFS_UUID_SIZE == 16!" 
+#endif +#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ +#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to + * received subvols */ + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + + + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +/* + * flags definitions for directory entry item type + * + * Used by: + * struct btrfs_dir_item.type + */ +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allow for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct 
btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* Super block flags */ +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ + +struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { + __le32 refs; +} __attribute__ ((__packed__)); + + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +/* + * this flag is only used internally by scrub and may be changed at any time + * it is only declared here to avoid collisions + */ +#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 count; +} __attribute__ ((__packed__)); + + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. 
The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_inode_extref { + __le64 parent_objectid; + __le64 index; + __le16 name_len; + __u8 name[0]; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + +/* + * Internal in-memory flag that a subvolume has been marked for deletion but + * still visible as a directory + */ +#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; + + /* + * The following fields appear after subvol_uuids+subvol_times + * were introduced. + */ + + /* + * This generation number is used to test if the new fields are valid + * and up to date while reading the root item. Every time the root item + * is written out, the "generation" field is copied into this field. If + * anyone ever mounted the fs with an older kernel, we will have + * mismatching generation values here and thus must invalidate the + * new fields. See btrfs_update_root and btrfs_find_last_root for + * details. + * the offset of generation_v2 is also used as the start for the memset + * when invalidating the fields. + */ + __le64 generation_v2; + u8 uuid[BTRFS_UUID_SIZE]; + u8 parent_uuid[BTRFS_UUID_SIZE]; + u8 received_uuid[BTRFS_UUID_SIZE]; + __le64 ctransid; /* updated when an inode changes */ + __le64 otransid; /* trans when created */ + __le64 stransid; /* trans when sent. non-zero for received subvol */ + __le64 rtransid; /* trans when received. 
non-zero for received subvol */ + struct btrfs_timespec ctime; + struct btrfs_timespec otime; + struct btrfs_timespec stime; + struct btrfs_timespec rtime; + __le64 reserved[8]; /* for future */ +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +struct btrfs_disk_balance_args { + /* + * profiles to operate on, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 profiles; + + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; + + /* devid filter */ + __le64 devid; + + /* devid subset filter [pstart..pend) */ + __le64 pstart; + __le64 pend; + + /* btrfs virtual address space subset filter [vstart..vend) */ + __le64 vstart; + __le64 vend; + + /* + * profile to convert to, single is denoted by + * BTRFS_AVAIL_ALLOC_BIT_SINGLE + */ + __le64 target; + + /* BTRFS_BALANCE_ARGS_* */ + __le64 flags; + + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; + + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; +} __attribute__ ((__packed__)); + +/* + * store balance parameters to disk so that balance can be properly + * resumed after crash or unmount + */ +struct btrfs_balance_item { + /* BTRFS_BALANCE_* */ + __le64 flags; + + struct btrfs_disk_balance_args data; + struct btrfs_disk_balance_args meta; + struct btrfs_disk_balance_args sys; + + __le64 unused[4]; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + * + * At this offset in the structure, the inline extent data start. + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. 
+ */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +struct btrfs_dev_stats_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; +} __attribute__ ((__packed__)); + +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 +#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 +#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 +#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 +#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 +#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 + +struct btrfs_dev_replace_item { + /* + * grow this item struct at the end for future enhancements and keep + * the existing values unchanged + */ + __le64 src_devid; + __le64 cursor_left; + __le64 cursor_right; + __le64 cont_reading_from_srcdev_mode; + + __le64 replace_state; + __le64 time_started; + __le64 time_stopped; + __le64 num_write_errors; + __le64 num_uncorrectable_read_errors; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) +#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) +#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) +#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) +#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ + BTRFS_SPACE_INFO_GLOBAL_RSV) + +enum btrfs_raid_types { + BTRFS_RAID_RAID10, + BTRFS_RAID_RAID1, + BTRFS_RAID_DUP, + BTRFS_RAID_RAID0, + BTRFS_RAID_SINGLE, + BTRFS_RAID_RAID5, + BTRFS_RAID_RAID6, + BTRFS_NR_RAID_TYPES +}; + +#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ + BTRFS_BLOCK_GROUP_SYSTEM | \ + BTRFS_BLOCK_GROUP_METADATA) + +#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6 | \ + BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID10) +#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + +/* + * We need a bit for restriper to be able to tell when chunks of type + * SINGLE are available. This "extended" profile format is used in + * fs_info->avail_*_alloc_bits (in-memory) and balance item fields + * (on-disk). The corresponding on-disk bit in chunk.type is reserved + * to avoid remappings between two formats in future. + */ +#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) + +/* + * A fake block group type that is used to communicate global block reserve + * size to userspace via the SPACE_INFO ioctl. 
+ */ +#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) + +#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ + BTRFS_AVAIL_ALLOC_BIT_SINGLE) + +static inline u64 chunk_to_extended(u64 flags) +{ + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) + flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; + + return flags; +} +static inline u64 extended_to_chunk(u64 flags) +{ + return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; +} + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_info { + __le32 extent_count; + __le32 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) + +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline u64 btrfs_qgroup_level(u64 qgroupid) +{ + return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; +} + +/* + * is subvolume quota turned on? + */ +#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) +/* + * RESCAN is set during the initialization phase + */ +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) +/* + * Some qgroup entries are known to be out of date, + * either because the configuration has changed in a way that + * makes a rescan necessary, or because the fs has been mounted + * with a non-qgroup-aware version. + * Turning qouta off and on again makes it inconsistent, too. + */ +#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) + +#define BTRFS_QGROUP_STATUS_VERSION 1 + +struct btrfs_qgroup_status_item { + __le64 version; + /* + * the generation is updated during every commit. As older + * versions of btrfs are not aware of qgroups, it will be + * possible to detect inconsistencies by checking the + * generation on mount time + */ + __le64 generation; + + /* flag definitions see above */ + __le64 flags; + + /* + * only used during scanning to record the progress + * of the scan. It contains a logical address + */ + __le64 rescan; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_info_item { + __le64 generation; + __le64 rfer; + __le64 rfer_cmpr; + __le64 excl; + __le64 excl_cmpr; +} __attribute__ ((__packed__)); + +struct btrfs_qgroup_limit_item { + /* + * only updated when any of the other values change + */ + __le64 flags; + __le64 max_rfer; + __le64 max_excl; + __le64 rsv_rfer; + __le64 rsv_excl; +} __attribute__ ((__packed__)); + +#endif /* _BTRFS_CTREE_H_ */ -- cgit v1.2.3-55-g7522 From 14b05c5106312744badb463e7fb9a78811e44fc0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Apr 2016 16:14:30 -0400 Subject: btrfs: uapi/linux/btrfs_tree.h, use __u8 and __u64 u8 and u64 aren't exported to userspace, while __u8 and __u64 are. 
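For illustration only (not part of this patch): a uapi header is consumed by userspace, which only sees the __-prefixed fixed-width types from <linux/types.h>, so packed on-disk structures have to be spelled with those. A minimal sketch, using a hypothetical struct name:

#include <linux/types.h>

/* sketch: bare u8/u64 would not resolve in userspace, __u8/__le64 do */
struct example_disk_key {
        __le64 objectid;
        __u8 type;
        __le64 offset;
} __attribute__ ((__packed__));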
Signed-off-by: Jeff Mahoney Reviewed-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 52 ++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 1e875057d6b2..d5ad15a106a7 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -334,14 +334,14 @@ */ struct btrfs_disk_key { __le64 objectid; - u8 type; + __u8 type; __le64 offset; } __attribute__ ((__packed__)); struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; + __u64 objectid; + __u8 type; + __u64 offset; } __attribute__ ((__packed__)); struct btrfs_dev_item { @@ -379,22 +379,22 @@ struct btrfs_dev_item { __le32 dev_group; /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; + __u8 seek_speed; /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; + __u8 bandwidth; /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; + __u8 uuid[BTRFS_UUID_SIZE]; /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; + __u8 fsid[BTRFS_UUID_SIZE]; } __attribute__ ((__packed__)); struct btrfs_stripe { __le64 devid; __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; + __u8 dev_uuid[BTRFS_UUID_SIZE]; } __attribute__ ((__packed__)); struct btrfs_chunk { @@ -433,7 +433,7 @@ struct btrfs_chunk { struct btrfs_free_space_entry { __le64 offset; __le64 bytes; - u8 type; + __u8 type; } __attribute__ ((__packed__)); struct btrfs_free_space_header { @@ -486,7 +486,7 @@ struct btrfs_extent_item_v0 { struct btrfs_tree_block_info { struct btrfs_disk_key key; - u8 level; + __u8 level; } __attribute__ ((__packed__)); struct btrfs_extent_data_ref { @@ -501,7 +501,7 @@ struct btrfs_shared_data_ref { } __attribute__ ((__packed__)); struct btrfs_extent_inline_ref { - u8 type; + __u8 type; __le64 offset; } __attribute__ ((__packed__)); @@ -523,7 +523,7 @@ struct btrfs_dev_extent { __le64 chunk_objectid; __le64 chunk_offset; __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; } __attribute__ ((__packed__)); struct btrfs_inode_ref { @@ -583,7 +583,7 @@ struct btrfs_dir_item { __le64 transid; __le16 data_len; __le16 name_len; - u8 type; + __u8 type; } __attribute__ ((__packed__)); #define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) @@ -605,8 +605,8 @@ struct btrfs_root_item { __le64 flags; __le32 refs; struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; + __u8 drop_level; + __u8 level; /* * The following fields appear after subvol_uuids+subvol_times @@ -625,9 +625,9 @@ struct btrfs_root_item { * when invalidating the fields. */ __le64 generation_v2; - u8 uuid[BTRFS_UUID_SIZE]; - u8 parent_uuid[BTRFS_UUID_SIZE]; - u8 received_uuid[BTRFS_UUID_SIZE]; + __u8 uuid[BTRFS_UUID_SIZE]; + __u8 parent_uuid[BTRFS_UUID_SIZE]; + __u8 received_uuid[BTRFS_UUID_SIZE]; __le64 ctransid; /* updated when an inode changes */ __le64 otransid; /* trans when created */ __le64 stransid; /* trans when sent. non-zero for received subvol */ @@ -751,12 +751,12 @@ struct btrfs_file_extent_item { * it is treated like an incompat flag for reading and writing, * but not for stat. */ - u8 compression; - u8 encryption; + __u8 compression; + __u8 encryption; __le16 other_encoding; /* spare for later use */ /* are we inline data or a real extent? 
*/ - u8 type; + __u8 type; /* * disk space consumed by the extent, checksum blocks are included @@ -783,7 +783,7 @@ struct btrfs_file_extent_item { } __attribute__ ((__packed__)); struct btrfs_csum_item { - u8 csum; + __u8 csum; } __attribute__ ((__packed__)); struct btrfs_dev_stats_item { @@ -874,14 +874,14 @@ enum btrfs_raid_types { #define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ BTRFS_AVAIL_ALLOC_BIT_SINGLE) -static inline u64 chunk_to_extended(u64 flags) +static inline __u64 chunk_to_extended(__u64 flags) { if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; return flags; } -static inline u64 extended_to_chunk(u64 flags) +static inline __u64 extended_to_chunk(__u64 flags) { return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; } @@ -900,7 +900,7 @@ struct btrfs_free_space_info { #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) #define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline u64 btrfs_qgroup_level(u64 qgroupid) +static inline __u64 btrfs_qgroup_level(__u64 qgroupid) { return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; } -- cgit v1.2.3-55-g7522 From ad8403df054c9b5dc3175a26c6179571b9cafa4e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 10 Mar 2016 12:22:15 +0800 Subject: btrfs: pass the right error code to the btrfs_std_error Also drop the newline from the message. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f76aea6e398..03df0bbb445d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4847,8 +4847,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_handle_fs_error(root->fs_info, ret, - "failed to update qgroup status and info\n"); + btrfs_handle_fs_error(root->fs_info, err, + "failed to update qgroup status and info"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; -- cgit v1.2.3-55-g7522 From 8ed01abe7da6af62f2089da9a4c1fe839dc638f5 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 14 Apr 2016 18:24:10 +0800 Subject: btrfs: s_bdev is not null after missing replace Yauhen reported in the ML that s_bdev is null at mount, and s_bdev gets updated to some device when missing device is replaced, as because bdev is null for missing device, things gets matched up. Fix this by checking if s_bdev is set. I didn't want to completely remove updating s_bdev because the future multi device support at vfs layer may need it. 
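To make the failure mode concrete, a minimal sketch follows (hypothetical struct and function names, not the kernel code): a missing device has a NULL bdev, so when s_bdev is also NULL an unguarded pointer comparison matches and s_bdev gets rewritten; checking that s_bdev is set first avoids that.

struct block_device;

struct sketch_super  { struct block_device *s_bdev; };
struct sketch_device { struct block_device *bdev; };

/* buggy: NULL == NULL matches when src is a missing device */
static void sketch_update_sbdev(struct sketch_super *sb,
                                struct sketch_device *src,
                                struct sketch_device *tgt)
{
        if (sb->s_bdev == src->bdev)
                sb->s_bdev = tgt->bdev;
}

/* guarded, as this patch does: only rewrite s_bdev when it is set */
static void sketch_update_sbdev_checked(struct sketch_super *sb,
                                        struct sketch_device *src,
                                        struct sketch_device *tgt)
{
        if (sb->s_bdev && sb->s_bdev == src->bdev)
                sb->s_bdev = tgt->bdev;
}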
Signed-off-by: Anand Jain Reported-by: Yauhen Kharuzhy Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 ++- fs/btrfs/volumes.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5aebedf12b5f..c93eec9ca433 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -569,7 +569,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - if (fs_info->sb->s_bdev == src_device->bdev) + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == src_device->bdev)) fs_info->sb->s_bdev = tgt_device->bdev; if (fs_info->fs_devices->latest_bdev == src_device->bdev) fs_info->fs_devices->latest_bdev = tgt_device->bdev; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e43b01e438b1..41bc384f3483 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1848,7 +1848,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); - if (device->bdev == root->fs_info->sb->s_bdev) + if (root->fs_info->sb->s_bdev && + (root->fs_info->sb->s_bdev == device->bdev)) root->fs_info->sb->s_bdev = next_device->bdev; if (device->bdev == root->fs_info->fs_devices->latest_bdev) root->fs_info->fs_devices->latest_bdev = next_device->bdev; @@ -1996,7 +1997,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, next_device = list_entry(fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); - if (tgtdev->bdev == fs_info->sb->s_bdev) + if (fs_info->sb->s_bdev && + (tgtdev->bdev == fs_info->sb->s_bdev)) fs_info->sb->s_bdev = next_device->bdev; if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) fs_info->fs_devices->latest_bdev = next_device->bdev; -- cgit v1.2.3-55-g7522 From 88acff64c621aaeee2a4fe0ed124c77358069bce Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 3 May 2016 17:44:43 +0800 Subject: btrfs: cleanup assigning next active device with a check Creates helper fucntion as needed by the device delete and replace operations. Also now it checks if the next device being assigned is an active device. 
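Put differently, the selection criterion is: skip the device being removed (or replaced) and pick the first device that is neither missing nor without a block device. A rough sketch with a hypothetical device type, not the helper added below:

struct block_device;

struct sketch_device {
        struct sketch_device *next;
        struct block_device *bdev;
        int missing;
};

/* return the first device other than 'skip' that is actually usable */
static struct sketch_device *
sketch_next_active(struct sketch_device *head, struct sketch_device *skip)
{
        struct sketch_device *dev;

        for (dev = head; dev; dev = dev->next)
                if (dev != skip && !dev->missing && dev->bdev)
                        return dev;
        return NULL;
}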
Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 8 +++---- fs/btrfs/volumes.c | 60 ++++++++++++++++++++++++++++++++++++-------------- fs/btrfs/volumes.h | 2 ++ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c93eec9ca433..1f193f742f21 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -569,11 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - if (fs_info->sb->s_bdev && - (fs_info->sb->s_bdev == src_device->bdev)) - fs_info->sb->s_bdev = tgt_device->bdev; - if (fs_info->fs_devices->latest_bdev == src_device->bdev) - fs_info->fs_devices->latest_bdev = tgt_device->bdev; + + btrfs_assign_next_active_device(fs_info, src_device, tgt_device); + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 41bc384f3483..3fb0786060dd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1756,10 +1756,49 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, return 0; } +struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, + struct btrfs_device *device) +{ + struct btrfs_device *next_device; + + list_for_each_entry(next_device, &fs_devs->devices, dev_list) { + if (next_device != device && + !next_device->missing && next_device->bdev) + return next_device; + } + + return NULL; +} + +/* + * Helper function to check if the given device is part of s_bdev / latest_bdev + * and replace it with the provided or the next active device, in the context + * where this function called, there should be always be another device (or + * this_dev) which is active. 
+ */ +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev) +{ + struct btrfs_device *next_device; + + if (this_dev) + next_device = this_dev; + else + next_device = btrfs_find_next_active_device(fs_info->fs_devices, + device); + ASSERT(next_device); + + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == device->bdev)) + fs_info->sb->s_bdev = next_device->bdev; + + if (fs_info->fs_devices->latest_bdev == device->bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; +} + int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) { struct btrfs_device *device; - struct btrfs_device *next_device; struct btrfs_fs_devices *cur_devices; u64 num_devices; int ret = 0; @@ -1846,13 +1885,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) if (device->missing) device->fs_devices->missing_devices--; - next_device = list_entry(root->fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (root->fs_info->sb->s_bdev && - (root->fs_info->sb->s_bdev == device->bdev)) - root->fs_info->sb->s_bdev = next_device->bdev; - if (device->bdev == root->fs_info->fs_devices->latest_bdev) - root->fs_info->fs_devices->latest_bdev = next_device->bdev; + btrfs_assign_next_active_device(root->fs_info, device, NULL); if (device->bdev) { device->fs_devices->open_devices--; @@ -1981,8 +2014,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - struct btrfs_device *next_device; - mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); @@ -1995,13 +2026,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } fs_info->fs_devices->num_devices--; - next_device = list_entry(fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (fs_info->sb->s_bdev && - (tgtdev->bdev == fs_info->sb->s_bdev)) - fs_info->sb->s_bdev = next_device->bdev; - if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; + btrfs_assign_next_active_device(fs_info, tgtdev, NULL); + list_del_rcu(&tgtdev->dev_list); call_rcu(&tgtdev->rcu, free_device); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index febdb7bc9370..7038017bf93b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -445,6 +445,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); -- cgit v1.2.3-55-g7522 From 58409edd2d5cc24716cb9ce690803696c5118503 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 May 2016 11:46:10 +0200 Subject: btrfs: kill unused writepage_io_hook callback It seems to be long time unused, since 2008 and 6885f308b5570 ("Btrfs: Misc 2.6.25 updates"). Propagating the removal touches some code but has no functional effect. 
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 39 ++++++++++++++++----------------------- fs/btrfs/extent_io.h | 1 - 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d247fc0eea19..d22b0777c7e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3368,6 +3368,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + unsigned long max_nr; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, @@ -3423,32 +3425,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; + max_nr = (i_size >> PAGE_SHIFT) + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } - if (ret) { - SetPageError(page); - } else { - unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } + ret = submit_extent_page(write_flags, tree, wbc, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0, false); + if (ret) + SetPageError(page); - ret = submit_extent_page(write_flags, tree, wbc, page, - sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, - end_bio_extent_writepage, - 0, 0, 0, false); - if (ret) - SetPageError(page); - } cur = cur + iosize; pg_offset += iosize; nr++; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b5e0ade90e88..981f402bf754 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -71,7 +71,6 @@ struct extent_io_ops { u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, size_t size, struct bio *bio, -- cgit v1.2.3-55-g7522 From 2355ac8495c1fb8ae58bdfe191489682538c697d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 28 Apr 2016 11:59:34 +0200 Subject: btrfs: ioctl: reorder exclusive op check in RM_DEV Move the op exclusivity check before the other code (same as in ADD_DEV). 
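The point of the reordering is the usual acquire-early/unwind-in-reverse pattern: take the exclusivity gate before copying the user arguments, and release things in the opposite order on the error paths. A simplified control-flow sketch with placeholder names and error codes, not the actual ioctl body:

static int exclusive_op_running;        /* stand-in for the atomic flag */

static long sketch_rm_dev(int user_args_ok)
{
        long ret = 0;

        /* 1) take the exclusivity gate first, as ADD_DEV already does */
        if (exclusive_op_running)
                return -16;             /* "operation in progress" stand-in */
        exclusive_op_running = 1;

        /* 2) only then copy and validate the user-supplied arguments */
        if (!user_args_ok) {
                ret = -22;              /* -EINVAL stand-in */
                goto out;
        }

        /* ... perform the device removal ... */

out:
        /* 3) unwind in reverse order */
        exclusive_op_running = 0;
        return ret;
}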
Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 03df0bbb445d..45a19c6b2341 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2680,32 +2680,31 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto err_drop; - } - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); goto out; } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; mutex_lock(&root->fs_info->volume_mutex); ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); if (!ret) btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - -out: kfree(vol_args); -err_drop: +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: mnt_drop_write_file(file); + return ret; } -- cgit v1.2.3-55-g7522 From 88be159c905a2b4f6d59afa352bef410afb6af02 Mon Sep 17 00:00:00 2001 From: Austin S. Hemmelgarn Date: Wed, 23 Mar 2016 14:22:59 -0400 Subject: btrfs: allow balancing to dup with multi-device Currently, we don't allow the user to try and rebalance to a dup profile on a multi-device filesystem. In most cases, this is a perfectly sensible restriction as raid1 uses the same amount of space and provides better protection. However, when reshaping a multi-device filesystem down to a single device filesystem, this requires the user to convert metadata and system chunks to single profile before deleting devices, and then convert again to dup, which leaves a period of time where metadata integrity is reduced. This patch removes the single-device-only restriction from converting to dup profile to remove this potential data integrity reduction. Signed-off-by: Austin S. Hemmelgarn Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 745a619241c7..83dfed667637 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3700,10 +3700,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, num_devices--; } btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices > 1) + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; + if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); if (num_devices > 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; -- cgit v1.2.3-55-g7522 From ae02d1bd070767e109f4a6f1bb1f466e9698a355 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 30 Mar 2016 21:53:38 +0100 Subject: btrfs: fix mixed block count of available space Metadata for mixed block is already accounted in total data and should not be counted as part of the free metadata space. 
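The distinguishing property is that a mixed block group carries both the DATA and METADATA type flags, so its space already shows up in the data totals. An illustrative helper with sketch flag names (the real flag values appear earlier in this series):

#include <linux/types.h>

#define SKETCH_BG_DATA          (1ULL << 0)
#define SKETCH_BG_METADATA      (1ULL << 2)

/* a block group is "mixed" when it holds both data and metadata */
static inline int sketch_is_mixed_bg(__u64 flags)
{
        return (flags & (SKETCH_BG_DATA | SKETCH_BG_METADATA)) ==
               (SKETCH_BG_DATA | SKETCH_BG_METADATA);
}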
Signed-off-by: Luis de Bethencourt Link: https://bugzilla.kernel.org/show_bug.cgi?id=114281 Signed-off-by: David Sterba --- fs/btrfs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..bdca79ce45f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2051,6 +2051,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -2076,8 +2077,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2115,7 +2125,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; -- cgit v1.2.3-55-g7522 From 41b34accb265e3a20211a7a8ef3625678f1c6ec7 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Wed, 30 Mar 2016 23:18:14 +0100 Subject: btrfs: avoid overflowing f_bfree Since mixed block groups accounting isn't byte-accurate and f_bree is an unsigned integer, it could overflow. Avoid this. Signed-off-by: Luis de Bethencourt Suggested-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index bdca79ce45f1..fe03efb5bec0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2100,7 +2100,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); - buf->f_bfree -= block_rsv->size >> bits; + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); -- cgit v1.2.3-55-g7522 From f5ecec3ce21f706e9e7a330b2e8e5a2941927b46 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 13 Apr 2016 09:40:59 +0300 Subject: btrfs: send: silence an integer overflow warning The "sizeof(*arg->clone_sources) * arg->clone_sources_count" expression can overflow. It causes several static checker warnings. It's all under CAP_SYS_ADMIN so it's not that serious but lets silence the warnings. 
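The guard added below is the usual check-before-multiply idiom. A free-standing sketch of that idiom with made-up names (it is not the ioctl code itself): reject the element count while it is still a plain count, so the byte size can never wrap.

#include <stdint.h>
#include <stdio.h>

/* Illustrative helper, not a kernel API; returns -1 where the ioctl returns -EINVAL. */
static int checked_array_bytes(uint64_t count, size_t elem_size, uint64_t *bytes)
{
        if (elem_size && count > UINT64_MAX / elem_size)
                return -1;              /* count * elem_size would overflow */
        *bytes = count * elem_size;     /* safe: cannot wrap */
        return 0;
}

int main(void)
{
        uint64_t bytes;

        if (checked_array_bytes(UINT64_MAX / 2, 8, &bytes) < 0)
                printf("rejected oversized count\n");
        if (checked_array_bytes(16, 8, &bytes) == 0)
                printf("16 elements -> %llu bytes\n", (unsigned long long)bytes);
        return 0;
}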
Signed-off-by: Dan Carpenter Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8d358c547c59..ec433795fa71 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5978,6 +5978,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->clone_sources_count > + ULLONG_MAX / sizeof(*arg->clone_sources)) { + ret = -EINVAL; + goto out; + } + if (!access_ok(VERIFY_READ, arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { -- cgit v1.2.3-55-g7522 From 779bf3fefa835cb52a07457c8acac6f2f66f2493 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Mon, 18 Apr 2016 16:51:23 +0800 Subject: btrfs: fix lock dep warning, move scratch dev out of device_list_mutex and uuid_mutex When the replace target fails, the target device will be taken out of fs device list, scratch + update_dev_time and freed. However we could do the scratch + update_dev_time and free part after the device has been taken out of device list, so that we don't have to hold the device_list_mutex and uuid_mutex locks. Reported issue: [ 5375.718845] ====================================================== [ 5375.718846] [ INFO: possible circular locking dependency detected ] [ 5375.718849] 4.4.5-scst31x-debug-11+ #40 Not tainted [ 5375.718849] ------------------------------------------------------- [ 5375.718851] btrfs-health/4662 is trying to acquire lock: [ 5375.718861] (sb_writers){.+.+.+}, at: [] __sb_start_write+0xb7/0xf0 [ 5375.718862] [ 5375.718862] but task is already holding lock: [ 5375.718907] (&fs_devs->device_list_mutex){+.+.+.}, at: [] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.718907] [ 5375.718907] which lock already depends on the new lock. 
[ 5375.718907] [ 5375.718908] [ 5375.718908] the existing dependency chain (in reverse order) is: [ 5375.718911] [ 5375.718911] -> #3 (&fs_devs->device_list_mutex){+.+.+.}: [ 5375.718917] [] lock_acquire+0xce/0x1e0 [ 5375.718921] [] mutex_lock_nested+0x69/0x3c0 [ 5375.718940] [] btrfs_show_devname+0x36/0x210 [btrfs] [ 5375.718945] [] show_vfsmnt+0x49/0x150 [ 5375.718948] [] m_show+0x17/0x20 [ 5375.718951] [] seq_read+0x2d8/0x3b0 [ 5375.718955] [] __vfs_read+0x28/0xd0 [ 5375.718959] [] vfs_read+0x86/0x130 [ 5375.718962] [] SyS_read+0x49/0xa0 [ 5375.718966] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718968] [ 5375.718968] -> #2 (namespace_sem){+++++.}: [ 5375.718971] [] lock_acquire+0xce/0x1e0 [ 5375.718974] [] down_write+0x49/0x80 [ 5375.718977] [] lock_mount+0x43/0x1c0 [ 5375.718979] [] do_add_mount+0x23/0xd0 [ 5375.718982] [] do_mount+0x27b/0xe30 [ 5375.718985] [] SyS_mount+0x8c/0xd0 [ 5375.718988] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.718991] [ 5375.718991] -> #1 (&sb->s_type->i_mutex_key#5){+.+.+.}: [ 5375.718994] [] lock_acquire+0xce/0x1e0 [ 5375.718996] [] mutex_lock_nested+0x69/0x3c0 [ 5375.719001] [] path_openat+0x468/0x1360 [ 5375.719004] [] do_filp_open+0x7e/0xe0 [ 5375.719007] [] do_sys_open+0x12b/0x210 [ 5375.719010] [] SyS_open+0x1e/0x20 [ 5375.719013] [] entry_SYSCALL_64_fastpath+0x16/0x7a [ 5375.719015] [ 5375.719015] -> #0 (sb_writers){.+.+.+}: [ 5375.719018] [] __lock_acquire+0x17ba/0x1ae0 [ 5375.719021] [] lock_acquire+0xce/0x1e0 [ 5375.719026] [] percpu_down_read+0x4f/0xa0 [ 5375.719028] [] __sb_start_write+0xb7/0xf0 [ 5375.719031] [] mnt_want_write+0x24/0x50 [ 5375.719035] [] path_openat+0xd32/0x1360 [ 5375.719037] [] do_filp_open+0x7e/0xe0 [ 5375.719040] [] file_open_name+0xe4/0x130 [ 5375.719043] [] filp_open+0x33/0x60 [ 5375.719073] [] update_dev_time+0x16/0x40 [btrfs] [ 5375.719099] [] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719123] [] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719150] [] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719175] [] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719199] [] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719222] [] health_kthread+0x246/0x490 [btrfs] [ 5375.719225] [] kthread+0xef/0x110 [ 5375.719229] [] ret_from_fork+0x3f/0x70 [ 5375.719230] [ 5375.719230] other info that might help us debug this: [ 5375.719230] [ 5375.719233] Chain exists of: [ 5375.719233] sb_writers --> namespace_sem --> &fs_devs->device_list_mutex [ 5375.719233] [ 5375.719234] Possible unsafe locking scenario: [ 5375.719234] [ 5375.719234] CPU0 CPU1 [ 5375.719235] ---- ---- [ 5375.719236] lock(&fs_devs->device_list_mutex); [ 5375.719238] lock(namespace_sem); [ 5375.719239] lock(&fs_devs->device_list_mutex); [ 5375.719241] lock(sb_writers); [ 5375.719241] [ 5375.719241] *** DEADLOCK *** [ 5375.719241] [ 5375.719243] 4 locks held by btrfs-health/4662: [ 5375.719266] #0: (&fs_info->health_mutex){+.+.+.}, at: [] health_kthread+0x63/0x490 [btrfs] [ 5375.719293] #1: (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.+.}, at: [] btrfs_dev_replace_finishing+0x41/0x990 [btrfs] [ 5375.719319] #2: (uuid_mutex){+.+.+.}, at: [] btrfs_destroy_dev_replace_tgtdev+0x20/0x150 [btrfs] [ 5375.719343] #3: (&fs_devs->device_list_mutex){+.+.+.}, at: [] btrfs_destroy_dev_replace_tgtdev+0x3c/0x150 [btrfs] [ 5375.719343] [ 5375.719343] stack backtrace: [ 5375.719347] CPU: 2 PID: 4662 Comm: btrfs-health Not tainted 4.4.5-scst31x-debug-11+ #40 [ 5375.719348] Hardware name: Supermicro SYS-6018R-WTRT/X10DRW-iT, BIOS 1.0c 
01/07/2015 [ 5375.719352] 0000000000000000 ffff880856f73880 ffffffff813529e3 ffffffff826182a0 [ 5375.719354] ffffffff8260c090 ffff880856f738c0 ffffffff810d667c ffff880856f73930 [ 5375.719357] ffff880861f32b40 ffff880861f32b68 0000000000000003 0000000000000004 [ 5375.719357] Call Trace: [ 5375.719363] [] dump_stack+0x85/0xc2 [ 5375.719366] [] print_circular_bug+0x1ec/0x260 [ 5375.719369] [] __lock_acquire+0x17ba/0x1ae0 [ 5375.719373] [] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719376] [] lock_acquire+0xce/0x1e0 [ 5375.719378] [] ? __sb_start_write+0xb7/0xf0 [ 5375.719383] [] percpu_down_read+0x4f/0xa0 [ 5375.719385] [] ? __sb_start_write+0xb7/0xf0 [ 5375.719387] [] __sb_start_write+0xb7/0xf0 [ 5375.719389] [] mnt_want_write+0x24/0x50 [ 5375.719393] [] path_openat+0xd32/0x1360 [ 5375.719415] [] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719418] [] ? debug_lockdep_rcu_enabled+0x1d/0x20 [ 5375.719420] [] do_filp_open+0x7e/0xe0 [ 5375.719423] [] ? rcu_read_lock_sched_held+0x6d/0x80 [ 5375.719426] [] ? kmem_cache_alloc+0x26b/0x5d0 [ 5375.719430] [] ? getname_kernel+0x34/0x120 [ 5375.719433] [] file_open_name+0xe4/0x130 [ 5375.719436] [] filp_open+0x33/0x60 [ 5375.719462] [] update_dev_time+0x16/0x40 [btrfs] [ 5375.719485] [] btrfs_scratch_superblocks+0x4e/0x90 [btrfs] [ 5375.719506] [] btrfs_destroy_dev_replace_tgtdev+0x65/0x150 [btrfs] [ 5375.719530] [] btrfs_dev_replace_finishing+0x6b0/0x990 [btrfs] [ 5375.719554] [] ? btrfs_dev_replace_finishing+0x553/0x990 [btrfs] [ 5375.719576] [] btrfs_dev_replace_start+0x33e/0x540 [btrfs] [ 5375.719598] [] btrfs_auto_replace_start+0xf8/0x140 [btrfs] [ 5375.719621] [] health_kthread+0x246/0x490 [btrfs] [ 5375.719641] [] ? health_kthread+0x138/0x490 [btrfs] [ 5375.719661] [] ? btrfs_congested_fn+0x180/0x180 [btrfs] [ 5375.719663] [] kthread+0xef/0x110 [ 5375.719666] [] ? kthread_create_on_node+0x200/0x200 [ 5375.719669] [] ret_from_fork+0x3f/0x70 [ 5375.719672] [] ? kthread_create_on_node+0x200/0x200 [ 5375.719697] ------------[ cut here ]------------ Signed-off-by: Anand Jain Reported-by: Yauhen Kharuzhy Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 83dfed667637..7bd553d42e0d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2024,10 +2024,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); - if (tgtdev->bdev) { - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + if (tgtdev->bdev) fs_info->fs_devices->open_devices--; - } + fs_info->fs_devices->num_devices--; next_device = list_entry(fs_info->fs_devices->devices.next, @@ -2038,10 +2037,18 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = next_device->bdev; list_del_rcu(&tgtdev->dev_list); - call_rcu(&tgtdev->rcu, free_device); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + + /* + * The update_dev_time() with in btrfs_scratch_superblocks() + * may lead to a call to btrfs_show_devname() which will try + * to hold device_list_mutex. And here this device + * is already out of device list, so we don't have to hold + * the device_list_mutex lock. 
+ */ + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + call_rcu(&tgtdev->rcu, free_device); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, -- cgit v1.2.3-55-g7522 From 6ff48ce06b07255a6459cd8b816a110971a81f00 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: btrfs: send: use vmalloc only as fallback for send_buf Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ec433795fa71..b6e2c6ec4ee5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6028,10 +6028,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = vmalloc(sctx->send_max_size); + sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } } sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); @@ -6220,7 +6223,7 @@ out: fput(sctx->send_filp); vfree(sctx->clone_roots); - vfree(sctx->send_buf); + kvfree(sctx->send_buf); vfree(sctx->read_buf); name_cache_free(sctx); -- cgit v1.2.3-55-g7522 From eb5b75fe2e61a9ba907785b70318736112b0cf93 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: btrfs: send: use vmalloc only as fallback for read_buf Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b6e2c6ec4ee5..4e950ebbef53 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6037,10 +6037,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } } - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } } sctx->pending_dir_moves = RB_ROOT; @@ -6224,7 +6227,7 @@ out: vfree(sctx->clone_roots); kvfree(sctx->send_buf); - vfree(sctx->read_buf); + kvfree(sctx->read_buf); name_cache_free(sctx); -- cgit v1.2.3-55-g7522 From e55d1153dbf48485a74eb4bf4eefeaedcf1486a9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:52:02 +0200 Subject: btrfs: send: use temporary variable to store allocation size We're going to use the argument multiple times later. 
Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e950ebbef53..4f85a47c2f55 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + unsigned alloc_size; int sort_clone_roots = 0; int index; @@ -6050,24 +6051,25 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - sctx->clone_roots = vzalloc(sizeof(struct clone_root) * - (arg->clone_sources_count + 1)); + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + + sctx->clone_roots = vzalloc(alloc_size); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } + alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(arg->clone_sources_count * - sizeof(*arg->clone_sources)); + clone_sources_tmp = vmalloc(alloc_size); if (!clone_sources_tmp) { ret = -ENOMEM; goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, - arg->clone_sources_count * - sizeof(*arg->clone_sources)); + alloc_size); if (ret) { ret = -EFAULT; goto out; -- cgit v1.2.3-55-g7522 From c03d01f3404282712b9fd280297f133860c91c93 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: btrfs: send: use vmalloc only as fallback for clone_roots Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4f85a47c2f55..5a5d37b37150 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6053,10 +6053,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = vzalloc(alloc_size); + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; + sctx->clone_roots = vzalloc(alloc_size); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } } alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); @@ -6227,7 +6230,7 @@ out: if (sctx->send_filp) fput(sctx->send_filp); - vfree(sctx->clone_roots); + kvfree(sctx->clone_roots); kvfree(sctx->send_buf); kvfree(sctx->read_buf); -- cgit v1.2.3-55-g7522 From 2f91306a37809907474a06c1defdb1ff50be06f0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: btrfs: send: use vmalloc only as fallback for clone_sources_tmp Signed-off-by: David Sterba --- fs/btrfs/send.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5a5d37b37150..6a8c86074aa4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6065,10 +6065,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(alloc_size); + clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; + clone_sources_tmp = vmalloc(alloc_size); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, @@ -6106,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void 
__user *arg_) sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } @@ -6224,7 +6227,7 @@ out: btrfs_root_dec_send_in_progress(sctx->parent_root); kfree(arg); - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) -- cgit v1.2.3-55-g7522 From 153519559a39725c5a45269256fec0efb81bcd1f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 11 Apr 2016 18:40:08 +0200 Subject: btrfs: clone: use vmalloc only as fallback for nodesize bufer Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f545f81f642d..cfa7d47b1ba0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3468,13 +3468,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(root->nodesize); - if (!buf) - return ret; + buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + if (!buf) { + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + } path = btrfs_alloc_path(); if (!path) { - vfree(buf); + kvfree(buf); return ret; } @@ -3775,7 +3778,7 @@ process_slot: out: btrfs_free_path(path); - vfree(buf); + kvfree(buf); return ret; } -- cgit v1.2.3-55-g7522 From 49a3c4d9b64d2e5d6fd268d53153945d04cb5559 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Mar 2016 17:49:22 +0100 Subject: btrfs: use dynamic allocation for root item in create_subvol The size of root item is more than 400 bytes, which is quite a lot of stack space. As we do IO from inside the subvolume ioctls, we should keep the stack usage low in case the filesystem is on top of other layers (NFS, device mapper, iscsi, etc). Reviewed-by: Tsutomu Itoh Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 65 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cfa7d47b1ba0..798f58e7338e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item root_item; + struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir, u64 qgroup_reserved; uuid_le new_uuid; + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) - return ret; + goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be * screwed up since it assume subvolme qgroup's level to be 0. 
*/ - if (btrfs_qgroup_level(objectid)) - return -ENOSPC; + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto fail_free; + } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, &qgroup_reserved, false); if (ret) - return ret; + goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); - return ret; + goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); - memset(&root_item, 0, sizeof(root_item)); - - inode_item = &root_item.inode; + inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - btrfs_set_root_flags(&root_item, 0); - btrfs_set_root_limit(&root_item, 0); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); - btrfs_set_root_generation_v2(&root_item, - btrfs_root_generation(&root_item)); + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); uuid_le_gen(&new_uuid); - memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); - btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); - root_item.ctime = root_item.otime; - btrfs_set_root_ctransid(&root_item, trans->transid); - btrfs_set_root_otransid(&root_item, trans->transid); + memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(&root_item, new_dirid); + btrfs_set_root_dirid(root_item, new_dirid); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); + root_item); if (ret) goto fail; @@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir, BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, root, ret); fail: + kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv, 
qgroup_reserved); @@ -629,6 +634,10 @@ fail: d_instantiate(dentry, inode); } return ret; + +fail_free: + kfree(root_item); + return ret; } static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) -- cgit v1.2.3-55-g7522 From e6c11f9a462e9ef4876d5e1539a6c06eded4c793 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Mar 2016 18:00:53 +0100 Subject: btrfs: reuse existing variable in scrub_stripe, reduce stack usage The key variable occupies 17 bytes, the key_start is used once, we can simply reuse existing 'key' for that purpose. As the key is not a simple type, compiler doest not do it on itself. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4678f03e878e..2ff2876656ba 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3070,7 +3070,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int slot; u64 nstripes; struct extent_buffer *l; - struct btrfs_key key; u64 physical; u64 logical; u64 logic_end; @@ -3079,7 +3078,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int mirror_num; struct reada_control *reada1; struct reada_control *reada2; - struct btrfs_key key_start; + struct btrfs_key key; struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -3158,21 +3157,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key_start, &key_end); + reada1 = btrfs_reada_add(root, &key, &key_end); - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + reada2 = btrfs_reada_add(csum_root, &key, &key_end); if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); -- cgit v1.2.3-55-g7522 From ee6111386a1b304f8bf589d36810d53e3b27ee20 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 23 Jan 2015 18:43:31 +0100 Subject: btrfs: add read-only check to sysfs handler of features We don't want to trigger the change on a read-only filesystem, similar to what the label handler does. 
Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 539e7b5e3f86..6a6bb600b1ff 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; -- cgit v1.2.3-55-g7522 From 66ac9fe7bacf9fa76c472efc7a7aaa590c7bce6a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Apr 2016 16:03:57 +0200 Subject: btrfs: add check to sysfs handler of label Add a sanity check for the fs_info as we will dereference it, similar to what the 'store features' handler does. Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6a6bb600b1ff..3d14618ce54b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -377,6 +377,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; + if (!fs_info) + return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; -- cgit v1.2.3-55-g7522 From ee17fc8005287d2d6ca7cab6e814e5043d773735 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Apr 2016 16:22:06 +0200 Subject: btrfs: sysfs: protect reading label by lock If the label setting ioctl races with sysfs label handler, we could get mixed result in the output, part old part new. We should either get the old or new label. The chances to hit this race are low. Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 3d14618ce54b..4879656bda3c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -367,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; - return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, -- cgit v1.2.3-55-g7522 From 3d8da67817606380fdadfa483d4dba5c3a5446c6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 26 Apr 2016 17:53:31 -0700 Subject: Btrfs: fix divide error upon chunk's stripe_len The struct 'map_lookup' uses type int for @stripe_len, while btrfs_chunk_stripe_len() can return a u64 value, and it may end up with @stripe_len being undefined value and it can lead to 'divide error' in __btrfs_map_block(). This changes 'map_lookup' to use type u64 for stripe_len, also right now we only use BTRFS_STRIPE_LEN for stripe_len, so this adds a valid checker for BTRFS_STRIPE_LEN. 
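A hedged, free-standing illustration of the type hazard (plain C, not the kernel code): an on-disk 64-bit value assigned to an int narrows, on common ABIs keeping only the low 32 bits, so a value such as 1ULL << 32 becomes zero and a later division involving that field faults. Keeping the field u64 and validating it against BTRFS_STRIPE_LEN, as the patch does, removes both failure modes.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* A value a corrupted chunk item could plausibly carry. */
        uint64_t on_disk_stripe_len = 1ULL << 32;
        /* Narrowing a u64 to int is implementation-defined; common ABIs keep
         * only the low 32 bits, which here are all zero. */
        int narrowed = (int)on_disk_stripe_len;

        printf("u64: %llu  narrowed to int: %d\n",
               (unsigned long long)on_disk_stripe_len, narrowed);
        if (narrowed == 0)
                printf("a division by this field would now fault\n");
        return 0;
}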
Reported-by: Vegard Nossum Reported-by: Quentin Casasnovas Signed-off-by: Liu Bo Reviewed-by: David Sterba [ folded division fix to scrub_raid56_parity ] Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2ff2876656ba..96d2a0de35a8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2860,7 +2860,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = map->stripe_len / root->sectorsize; + nsectors = div_u64(map->stripe_len, root->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7bd553d42e0d..f13e2bcc1398 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6254,7 +6254,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk length %llu", length); return -EIO; } - if (!is_power_of_2(stripe_len)) { + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", stripe_len); return -EIO; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..7507be74f7da 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -347,7 +347,7 @@ struct map_lookup { u64 type; int io_align; int io_width; - int stripe_len; + u64 stripe_len; int sector_size; int num_stripes; int sub_stripes; -- cgit v1.2.3-55-g7522 From e042d1ec4417981dfe9331e47b76f17929bc2ffe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 12 Apr 2016 12:54:40 -0400 Subject: Btrfs: remove BUG_ON()'s in btrfs_map_block btrfs_map_block can go horribly wrong in the face of fs corruption, lets agree to not be assholes and panic at any possible chance things are all fucked up. 
Signed-off-by: Josef Bacik [ removed type casts ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f13e2bcc1398..8d3ceb0fd239 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5290,7 +5290,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; - BUG_ON(offset < stripe_offset); + if (offset < stripe_offset) { + btrfs_crit(fs_info, "stripe math has gone wrong, " + "stripe_offset=%llu, offset=%llu, start=%llu, " + "logical=%llu, stripe_len=%llu", + stripe_offset, offset, em->start, logical, + stripe_len); + free_extent_map(em); + return -EINVAL; + } /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; @@ -5531,7 +5539,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, &stripe_index); mirror_num = stripe_index + 1; } - BUG_ON(stripe_index >= map->num_stripes); + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, "stripe index math went horribly wrong, " + "got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { -- cgit v1.2.3-55-g7522 From 2473114981a36b4f0f57cf6e9548037d547a71b7 Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Fri, 29 Apr 2016 18:33:59 -0700 Subject: btrfs: Fix BUG_ON condition in scrub_setup_recheck_block() pagev array in scrub_block{} is of size SCRUB_MAX_PAGES_PER_BLOCK. page_index should be checked with the same to trigger BUG_ON(). Signed-off-by: Ashish Samant Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 96d2a0de35a8..d270c700ed31 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bbio = bbio; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); -- cgit v1.2.3-55-g7522 From 48b3b9d401ec86899a52003b37331190a35a81a6 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 12 Apr 2016 21:36:16 +0800 Subject: btrfs: fix lock dep warning move scratch super outside of chunk_mutex Move scratch super outside of the chunk lock to avoid below lockdep warning. The better place to scratch super is in the function btrfs_rm_dev_replace_free_srcdev() just before free_device, which is outside of the chunk lock as well. 
To reproduce: (fresh boot) mkfs.btrfs -f -draid5 -mraid5 /dev/sdc /dev/sdd /dev/sde mount /dev/sdc /btrfs dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100 (get devmgt from https://github.com/asj/devmgt.git) devmgt detach /dev/sde dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100 sync btrfs replace start -Brf 3 /dev/sdf /btrfs <-- devmgt attach host7 ====================================================== [ INFO: possible circular locking dependency detected ] 4.6.0-rc2asj+ #1 Not tainted --------------------------------------------------- btrfs/2174 is trying to acquire lock: (sb_writers){.+.+.+}, at: [] __sb_start_write+0xb4/0xf0 but task is already holding lock: (&fs_info->chunk_mutex){+.+.+.}, at: [] btrfs_dev_replace_finishing+0x145/0x980 [btrfs] which lock already depends on the new lock. Chain exists of: sb_writers --> &fs_devs->device_list_mutex --> &fs_info->chunk_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->chunk_mutex); lock(&fs_devs->device_list_mutex); lock(&fs_info->chunk_mutex); lock(sb_writers); *** DEADLOCK *** -> #0 (sb_writers){.+.+.+}: [] __lock_acquire+0x1bc5/0x1ee0 [] lock_acquire+0xbe/0x210 [] percpu_down_read+0x4a/0xa0 [] __sb_start_write+0xb4/0xf0 [] mnt_want_write+0x24/0x50 [] path_openat+0x952/0x1190 [] do_filp_open+0x91/0x100 [] file_open_name+0xfc/0x140 [] filp_open+0x33/0x60 [] update_dev_time+0x16/0x40 [btrfs] [] btrfs_scratch_superblocks+0x5d/0xb0 [btrfs] [] btrfs_rm_dev_replace_remove_srcdev+0xae/0xd0 [btrfs] [] btrfs_dev_replace_finishing+0x4b5/0x980 [btrfs] [] btrfs_dev_replace_start+0x358/0x530 [btrfs] Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8d3ceb0fd239..c5e0d2d431dc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1972,11 +1972,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->missing) fs_devices->missing_devices--; - if (srcdev->writeable) { + if (srcdev->writeable) fs_devices->rw_devices--; - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } if (srcdev->bdev) fs_devices->open_devices--; @@ -1987,6 +1984,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + if (srcdev->writeable) { + /* zero out the old super if it is writable */ + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + } call_rcu(&srcdev->rcu, free_device); /* -- cgit v1.2.3-55-g7522 From 7ab19625a911f7568ec85302e3aa7a64186006c8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 May 2016 11:32:00 +0200 Subject: btrfs: add write protection to SET_FEATURES ioctl Perform the want_write check if we get far enough to do any writes. 
Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 798f58e7338e..1e8ce5247a81 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5406,9 +5406,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; + ret = mnt_want_write_file(file); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } spin_lock(&root->fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); @@ -5427,7 +5433,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); +out_drop_write: + mnt_drop_write_file(file); + + return ret; } long btrfs_ioctl(struct file *file, unsigned int -- cgit v1.2.3-55-g7522 From 58d7bbf81f54667e36940d5f4b5609606efa597b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 4 May 2016 14:10:47 +0200 Subject: btrfs: ioctl: reorder exclusive op check in RM_DEV Move the op exclusivity check before the other code (same as in ADD_DEV). Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1e8ce5247a81..c81d6daefe74 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2689,32 +2689,31 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto err_drop; - } - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); goto out; } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; mutex_lock(&root->fs_info->volume_mutex); ret = btrfs_rm_device(root, vol_args->name); mutex_unlock(&root->fs_info->volume_mutex); - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); if (!ret) btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - -out: kfree(vol_args); -err_drop: +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: mnt_drop_write_file(file); + return ret; } -- cgit v1.2.3-55-g7522 From 2f3165ecf103599f82bf0ea254039db335fb5005 Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Thu, 5 May 2016 00:23:49 -0400 Subject: btrfs: don't force mounts to wait for cleaner_kthread to delete one or more subvolumes During a mount, we start the cleaner kthread first because the transaction kthread wants to wake up the cleaner kthread. We start the transaction kthread next because everything in btrfs wants transactions. We do reloc recovery in the thread that was doing the original mount call once the transaction kthread is running. This means that the cleaner kthread could already be running when reloc recovery happens (e.g. if a snapshot delete was started before a crash). 
Relocation does not play well with the cleaner kthread, so a mutex was added in commit 5f3164813b90f7dbcb5c3ab9006906222ce471b7 "Btrfs: fix race between balance recovery and root deletion" to prevent both from being active at the same time. If the cleaner kthread is already holding the mutex by the time we get to btrfs_recover_relocation, the mount will be blocked until at least one deleted subvolume is cleaned (possibly more if the mount process doesn't get the lock right away). During this time (which could be an arbitrarily long time on a large/slow filesystem), the mount process is stuck and the filesystem is unnecessarily inaccessible. Fix this by locking cleaner_mutex before we start cleaner_kthread, and unlocking the mutex after mount no longer requires it. This ensures that the mounting process will not be blocked by the cleaner kthread. The cleaner kthread is already prepared for mutex contention and will just go to sleep until the mutex is available. Signed-off-by: Zygo Blaxell Reviewed-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e47849d7427..070c1dad42bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2517,6 +2517,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + bool cleaner_mutex_locked = false; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -2997,6 +2998,13 @@ retry_root_backup: goto fail_sysfs; } + /* + * Hold the cleaner_mutex thread here so that we don't block + * for a long time on btrfs_recover_relocation. cleaner_kthread + * will wait for us to finish mounting the filesystem. + */ + mutex_lock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = true; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -3056,10 +3064,8 @@ retry_root_backup: ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto fail_qgroup; - - mutex_lock(&fs_info->cleaner_mutex); + /* We locked cleaner_mutex before creating cleaner_kthread. */ ret = btrfs_recover_relocation(tree_root); - mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { printk(KERN_WARNING "BTRFS: failed to recover relocation\n"); @@ -3067,6 +3073,8 @@ retry_root_backup: goto fail_qgroup; } } + mutex_unlock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = false; location.objectid = BTRFS_FS_TREE_OBJECTID; location.type = BTRFS_ROOT_ITEM_KEY; @@ -3180,6 +3188,10 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: + if (cleaner_mutex_locked) { + mutex_unlock(&fs_info->cleaner_mutex); + cleaner_mutex_locked = false; + } btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: -- cgit v1.2.3-55-g7522 From 8eb0dfdbda3f56bf7d248ed87fcc383df114ecbb Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sun, 8 May 2016 15:08:00 +0200 Subject: btrfs: fix int32 overflow in shrink_delalloc(). UBSAN: Undefined behaviour in fs/btrfs/extent-tree.c:4623:21 signed integer overflow: 10808 * 262144 cannot be represented in type 'int [8]' If 8192<=items<16384, we request a writeback of an insane number of pages which is benign (everything will be written). But if items>=16384, the space reservation won't be enough. 
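The thresholds follow from the arithmetic: EXTENT_SIZE_PER_ITEM is 256KiB (2^18 bytes), so a signed 32-bit product is only representable up to 8191 items, and at 16384 items the true product reaches 2^32, leaving a wrapped value far too small for the reservation. A small free-standing check of those numbers (ordinary C; only the constant is taken from the patch context):

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define EXTENT_SIZE_PER_ITEM (256 * 1024)       /* 262144 bytes */

int main(void)
{
        int max_safe_items = INT_MAX / EXTENT_SIZE_PER_ITEM;    /* 8191 */
        uint64_t widened = (uint64_t)10808 * EXTENT_SIZE_PER_ITEM;

        printf("largest item count whose product still fits in int: %d\n",
               max_safe_items);
        printf("10808 items, widened as in the fix: %llu bytes\n",
               (unsigned long long)widened);
        return 0;
}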
Signed-off-by: Adam Borowski Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..391f576789e9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4620,7 +4620,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; -- cgit v1.2.3-55-g7522 From 05135f597adcb94dc34fa87b82e68eb55e00f0eb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 9 May 2016 11:32:39 +0200 Subject: btrfs: switch to common message helpers in open_ctree, adjust messages Currently we lack the identification of the filesystem in most if not all mount messages, done via printk/pr_* functions. We can use the btrfs_* helpers in open_ctree, as the fs_info <-> sb link is established at the beginning of the function. The messages have been updated at the same time to be more consistent: * dropped sb->s_id, as it's not available via btrfs_* * added %d for return code where appropriate * wording changed * %Lx replaced by %llx Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 102 ++++++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aeb090583b78..123fab55b5a4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2713,7 +2713,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 
*/ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); goto fail_alloc; @@ -2733,7 +2733,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; goto fail_alloc; } @@ -2768,9 +2768,9 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super) & ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "BTRFS: couldn't mount because of " - "unsupported optional features (%Lx).\n", - features); + btrfs_err(fs_info, + "cannot mount because of unsupported optional features (%llx)", + features); err = -EINVAL; goto fail_alloc; } @@ -2781,7 +2781,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_INFO "BTRFS: has skinny extents\n"); + btrfs_info(fs_info, "has skinny extents"); /* * flag our filesystem as having big metadata blocks if @@ -2789,7 +2789,8 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + btrfs_info(fs_info, + "flagging fs with big metadata feature"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } @@ -2805,9 +2806,9 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + nodesize, sectorsize); goto fail_alloc; } @@ -2820,8 +2821,8 @@ int open_ctree(struct super_block *sb, features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " - "unsupported option features (%Lx).\n", + btrfs_err(fs_info, + "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; goto fail_alloc; @@ -2850,8 +2851,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_ERR "BTRFS: failed to read the system " - "array on %s\n", sb->s_id); + btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } @@ -2865,8 +2865,7 @@ int open_ctree(struct super_block *sb, generation); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { - printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk root"); if (!IS_ERR(chunk_root->node)) free_extent_buffer(chunk_root->node); chunk_root->node = NULL; @@ -2880,8 +2879,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } @@ -2892,8 +2890,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_ERR "BTRFS: 
failed to read devices on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -2905,8 +2902,7 @@ retry_root_backup: generation); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { - printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", - sb->s_id); + btrfs_warn(fs_info, "failed to read tree root"); if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; @@ -2938,20 +2934,19 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to recover balance\n"); + btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", - ret); + btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } @@ -2959,31 +2954,33 @@ retry_root_backup: ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { - pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", + ret); goto fail_block_groups; } ret = btrfs_sysfs_add_device(fs_devices); if (ret) { - pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs device interface: %d", + ret); goto fail_fsdev_sysfs; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { - pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { - printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = @@ -2991,7 +2988,8 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + btrfs_warn(fs_info, +"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", fs_info->fs_devices->missing_devices, fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; @@ -3011,8 +3009,7 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " - "mode\n"); + btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); } @@ -3030,8 +3027,9 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "BTRFS: failed to initialize" - " integrity check module %s\n", sb->s_id); + btrfs_warn(fs_info, + "failed to initialize integrity check module: %d", + ret); } #endif ret = btrfs_read_qgroup_config(fs_info); @@ -3061,8 +3059,8 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { - printk(KERN_WARNING - "BTRFS: failed 
to recover relocation\n"); + btrfs_warn(fs_info, "failed to recover relocation: %d", + ret); err = -EINVAL; goto fail_qgroup; } @@ -3083,11 +3081,11 @@ retry_root_backup: if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: creating free space tree\n"); + btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3104,14 +3102,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + btrfs_warn(fs_info, "failed to resume balance: %d", ret); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("BTRFS: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(tree_root); return ret; } @@ -3120,33 +3118,33 @@ retry_root_backup: if (btrfs_test_opt(tree_root, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: clearing free space tree\n"); + btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to clear free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); close_ctree(tree_root); return ret; } } if (!fs_info->uuid_root) { - pr_info("BTRFS: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create the UUID tree: %d", ret); close_ctree(tree_root); return ret; } } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { - pr_info("BTRFS: checking UUID tree\n"); + btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to check the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to check the UUID tree: %d", ret); close_ctree(tree_root); return ret; } -- cgit v1.2.3-55-g7522 From e1860a7724828a341037b010b3f4ff58bad53f95 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 9 May 2016 14:11:38 +0200 Subject: btrfs: GFP_NOFS does not GFP_HIGHMEM Masking HIGHMEM out of NOFS does not make sense. 
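The change is pure flag arithmetic: __GFP_HIGHMEM is not part of GFP_NOFS, so clearing it changes nothing and the mask can simply be dropped. A free-standing illustration with made-up flag values (not the real gfp_t encoding):

#include <stdio.h>

/* Made-up bits for illustration; not the kernel's gfp_t layout. */
#define DEMO___GFP_IO      0x01u
#define DEMO___GFP_FS      0x02u
#define DEMO___GFP_HIGHMEM 0x04u
#define DEMO_GFP_NOFS      (DEMO___GFP_IO)      /* FS and HIGHMEM not set */

int main(void)
{
        unsigned int masked = DEMO_GFP_NOFS & ~DEMO___GFP_HIGHMEM;

        /* HIGHMEM was never set, so masking it out is a no-op. */
        printf("NOFS=%#x, NOFS & ~HIGHMEM=%#x, identical=%d\n",
               DEMO_GFP_NOFS, masked, masked == DEMO_GFP_NOFS);
        return 0;
}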
Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6cef0062f929..61561c2a3f96 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -134,7 +134,7 @@ again: /* cached in the btrfs inode and can be accessed */ atomic_add(2, &node->refs); - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) { kmem_cache_free(delayed_node_cache, node); return ERR_PTR(ret); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 123fab55b5a4..4931d3856aa3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1640,7 +1640,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d22b0777c7e0..d02a637530b1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4815,7 +4815,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return NULL; eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; spin_lock(&fs_info->buffer_lock); @@ -4916,7 +4916,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; -- cgit v1.2.3-55-g7522 From 6ac10a6ac2b11ada24580cc76dcd0c182061c576 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 27 Apr 2016 02:15:15 +0200 Subject: btrfs: rename and document compression workspace members The names are confusing, pick more fitting names and add comments. 
Signed-off-by: David Sterba --- fs/btrfs/compression.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ff61a41ac90b..4d5cd9624bb3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -743,8 +743,11 @@ out: static struct { struct list_head idle_ws; spinlock_t ws_lock; - int num_ws; - atomic_t alloc_ws; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ wait_queue_head_t ws_wait; } btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; @@ -760,7 +763,7 @@ void __init btrfs_init_compress(void) for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); - atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + atomic_set(&btrfs_comp_ws[i].total_ws, 0); init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); } } @@ -777,35 +780,35 @@ static struct list_head *find_workspace(int type) struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { workspace = idle_ws->next; list_del(workspace); - (*num_ws)--; + (*free_ws)--; spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_ws) > cpus) { + if (atomic_read(total_ws) > cpus) { DEFINE_WAIT(wait); spin_unlock(ws_lock); prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_ws) > cpus && !*num_ws) + if (atomic_read(total_ws) > cpus && !*free_ws) schedule(); finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_ws); + atomic_inc(total_ws); spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake_up(ws_wait); } return workspace; @@ -820,21 +823,21 @@ static void free_workspace(int type, struct list_head *workspace) int idx = type - 1; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; spin_lock(ws_lock); - if (*num_ws < num_online_cpus()) { + if (*free_ws < num_online_cpus()) { list_add(workspace, idle_ws); - (*num_ws)++; + (*free_ws)++; spin_unlock(ws_lock); goto wake; } spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake: /* * Make sure counter is updated before we wake up waiters. 
@@ -857,7 +860,7 @@ static void free_workspaces(void) workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&btrfs_comp_ws[i].alloc_ws); + atomic_dec(&btrfs_comp_ws[i].total_ws); } } } -- cgit v1.2.3-55-g7522 From f77dd0d6b2f0f2cf290cacbd48f5eee18586e52b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 27 Apr 2016 02:55:15 +0200 Subject: btrfs: preallocate compression workspaces Preallocate one workspace for each compression type so we can guarantee forward progress in the worst case. A failure cannot be a hard error as we might not use compression at all on the filesystem. If we can't allocate the workspaces later when need them, it might actually deadlock, but in such situation the system has effectively not enough memory to operate properly. Signed-off-by: David Sterba --- fs/btrfs/compression.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 4d5cd9624bb3..38c058bcf359 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -761,10 +761,26 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); atomic_set(&btrfs_comp_ws[i].total_ws, 0); init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); + + /* + * Preallocate one workspace for each compression type so + * we can guarantee forward progress in the worst case + */ + workspace = btrfs_compress_op[i]->alloc_workspace(); + if (IS_ERR(workspace)) { + printk(KERN_WARNING + "BTRFS: cannot preallocate compression workspace, will try later"); + } else { + atomic_set(&btrfs_comp_ws[i].total_ws, 1); + btrfs_comp_ws[i].free_ws = 1; + list_add(workspace, &btrfs_comp_ws[i].idle_ws); + } } } -- cgit v1.2.3-55-g7522 From e721e49dd1681d45d71919f0561f5e978a34153c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 27 Apr 2016 02:41:17 +0200 Subject: btrfs: make find_workspace always succeed With just one preallocated workspace we can guarantee forward progress even if there's no memory available for new workspaces. The cost is more waiting but we also get rid of several error paths. On average, there will be several idle workspaces, so the waiting penalty won't be so bad. In the worst case, all cpus will compete for one workspace until there's some memory. Attempts to allocate a new one are done each time the waiters are woken up. Signed-off-by: David Sterba --- fs/btrfs/compression.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 38c058bcf359..c70625560265 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -785,8 +785,10 @@ void __init btrfs_init_compress(void) } /* - * this finds an available workspace or allocates a new one - * ERR_PTR is returned if things go bad. + * This finds an available workspace or allocates a new one. + * If it's not possible to allocate a new one, waits until there's one. + * Preallocation makes a forward progress guarantees and we do not return + * errors. */ static struct list_head *find_workspace(int type) { @@ -826,6 +828,14 @@ again: if (IS_ERR(workspace)) { atomic_dec(total_ws); wake_up(ws_wait); + + /* + * Do not return the error but go back to waiting. There's a + * workspace preallocated for each type and the compression + * time is bounded so we get to a workspace eventually. 
This + * makes our caller's life easier. + */ + goto again; } return workspace; } @@ -913,8 +923,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -949,8 +957,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -971,8 +977,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, -- cgit v1.2.3-55-g7522 From 523567168da04bae0f88802ddef49d00072c5d58 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 27 Apr 2016 03:07:39 +0200 Subject: btrfs: make find_workspace warn if there are no workspaces Be verbose if there are no workspaces at all, ie. the module init time preallocation failed. Signed-off-by: David Sterba --- fs/btrfs/compression.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c70625560265..658c39b70fba 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -834,7 +834,21 @@ again: * workspace preallocated for each type and the compression * time is bounded so we get to a workspace eventually. This * makes our caller's life easier. + * + * To prevent silent and low-probability deadlocks (when the + * initial preallocation fails), check if there are any + * workspaces at all. */ + if (atomic_read(total_ws) == 0) { + static DEFINE_RATELIMIT_STATE(_rs, + /* once per minute */ 60 * HZ, + /* no burst */ 1); + + if (__ratelimit(&_rs)) { + printk(KERN_WARNING + "no compression workspaces, low memory, retrying"); + } + } goto again; } return workspace; -- cgit v1.2.3-55-g7522 From 72928f2476d08c79f132b4f44a17c9a011dd98e3 Mon Sep 17 00:00:00 2001 From: Vincent Stehlé Date: Tue, 10 May 2016 14:56:20 +0200 Subject: Btrfs: fix fspath error deallocation Make sure to deallocate fspath with vfree() in case of error in init_ipath(). fspath is allocated with vmalloc() in init_data_container() since commit 425d17a290c0 ("Btrfs: use larger limit for translation of logical to inode"). Signed-off-by: Vincent Stehlé Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 80e8472d618b..d3090187fd76 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, ifp = kmalloc(sizeof(*ifp), GFP_NOFS); if (!ifp) { - kfree(fspath); + vfree(fspath); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3-55-g7522 From 6426c7ad697df4bddf1d222685e6802e7616feaa Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 11 May 2016 12:53:52 -0700 Subject: btrfs: qgroup: Fix qgroup accounting when creating snapshot Current btrfs qgroup design implies a requirement that after calling btrfs_qgroup_account_extents() there must be a commit root switch. Normally this is OK, as btrfs_qgroup_accounting_extents() is only called inside btrfs_commit_transaction() just be commit_cowonly_roots(). 
However there is a exception at create_pending_snapshot(), which will call btrfs_qgroup_account_extents() but no any commit root switch. In case of creating a snapshot whose parent root is itself (create a snapshot of fs tree), it will corrupt qgroup by the following trace: (skipped unrelated data) ====== btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, nr_old_roots = 0, nr_new_roots = 1 qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer = 0, excl = 0 qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer = 16384, excl = 16384 btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, nr_old_roots = 0, nr_new_roots = 0 ====== The problem here is in first qgroup_account_extent(), the nr_new_roots of the extent is 1, which means its reference got increased, and qgroup increased its rfer and excl. But at second qgroup_account_extent(), its reference got decreased, but between these two qgroup_account_extent(), there is no switch roots. This leads to the same nr_old_roots, and this extent just got ignored by qgroup, which means this extent is wrongly accounted. Fix it by call commit_cowonly_roots() after qgroup_account_extent() in create_pending_snapshot(), with needed preparation. Mark: I added a check at the top of qgroup_account_snapshot() to skip this code if qgroups are turned off. xfstest btrfs/122 exposes this problem. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Signed-off-by: Mark Fasheh Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 129 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 105 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..d7172d7ced5f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -311,10 +311,11 @@ loop: * when the transaction commits */ static int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + int force) { - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - root->last_trans < trans->transid) { + if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) || force) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); @@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&root->fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid) { + if (root->last_trans == trans->transid && !force) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } @@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&root->fs_info->reloc_mutex); - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); mutex_unlock(&root->fs_info->reloc_mutex); return 0; @@ -1310,6 +1311,92 @@ int btrfs_defrag_root(struct btrfs_root *root) return ret; } +/* + * Do all special snapshot related qgroup dirty hack. + * + * Will do all needed qgroup inherit and dirty hack like switch commit + * roots inside one transaction and write all btree into disk, to make + * qgroup works. + */ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_root *src, + struct btrfs_root *parent, + struct btrfs_qgroup_inherit *inherit, + u64 dst_objectid) +{ + struct btrfs_fs_info *fs_info = src->fs_info; + int ret; + + /* + * Save some performance in the case that qgroups are not + * enabled. 
If this check races with the ioctl, rescan will + * kick in anyway. + */ + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) { + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return 0; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * We are going to commit transaction, see btrfs_commit_transaction() + * comment for reason locking tree_log_mutex + */ + mutex_lock(&fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, src); + if (ret) + goto out; + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret < 0) + goto out; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret < 0) + goto out; + + /* Now qgroup are all updated, we can inherit it to new qgroups */ + ret = btrfs_qgroup_inherit(trans, fs_info, + src->root_key.objectid, dst_objectid, + inherit); + if (ret < 0) + goto out; + + /* + * Now we do a simplified commit transaction, which will: + * 1) commit all subvolume and extent tree + * To ensure all subvolume and extent tree have a valid + * commit_root to accounting later insert_dir_item() + * 2) write all btree blocks onto disk + * This is to make sure later btree modification will be cowed + * Or commit_root can be populated and cause wrong qgroup numbers + * In this simplified commit, we don't really care about other trees + * like chunk and root tree, as they won't affect qgroup. + * And we don't write super to avoid half committed status. + */ + ret = commit_cowonly_roots(trans, src); + if (ret) + goto out; + switch_commit_roots(trans->transaction, fs_info); + ret = btrfs_write_and_wait_transaction(trans, src); + if (ret) + btrfs_std_error(fs_info, ret, + "Error while writing out transaction for qgroup"); + +out: + mutex_unlock(&fs_info->tree_log_mutex); + + /* + * Force parent root to be updated, as we recorded it before so its + * last_trans == cur_transid. + * Or it won't be committed again onto disk after later + * insert_dir_item() + */ + if (!ret) + record_root_in_trans(trans, parent, 1); + return ret; +} + /* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. @@ -1383,7 +1470,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root, 0); cur_time = current_fs_time(parent_inode->i_sb); @@ -1420,7 +1507,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1516,6 +1603,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * Do special qgroup accounting for snapshot, as we do some qgroup + * snapshot hack to do fast snapshot. + * To co-operate with that hack, we do hack again. 
+ * Or snapshot will be greatly slowed down by a subtree qgroup rescan + */ + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + if (ret < 0) + goto fail; + ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, @@ -1559,23 +1657,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - /* - * account qgroup counters before qgroup_inherit() - */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: -- cgit v1.2.3-55-g7522 From 2c1984f244838477aab4e5882f4479491ae1084a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 12 May 2016 11:05:03 +0200 Subject: btrfs: build fixup for qgroup_account_snapshot The macro btrfs_std_error got renamed to btrfs_handle_fs_error in an independent branch for the same merge target (4.7). To make the code compilable for bisectability reasons, add a temporary stub. Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d7172d7ced5f..530081388d77 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1311,6 +1311,11 @@ int btrfs_defrag_root(struct btrfs_root *root) return ret; } +/* Bisesctability fixup, remove in 4.8 */ +#ifndef btrfs_std_error +#define btrfs_std_error btrfs_handle_fs_error +#endif + /* * Do all special snapshot related qgroup dirty hack. * -- cgit v1.2.3-55-g7522 From 657ed1aa4898c8304500e0d13f240d5a67e8be5f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Apr 2016 17:11:56 +0100 Subject: Btrfs: fix for incorrect directory entries after fsync log replay If we move a directory to a new parent and later log that parent and don't explicitly log the old parent, when we replay the log we can end up with entries for the moved directory in both the old and new parent directories. Besides being ilegal to have directories with multiple hard links in linux, it also resulted in the leaving the inode item with a link count of 1. A similar issue also happens if we move a regular file - after the log tree is replayed the file has a link in both the old and new parent directories, when it should be only at the new directory. Sample reproducer: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ mkdir /mnt/x $ mkdir /mnt/y $ touch /mnt/x/foo $ mkdir /mnt/y/z $ sync $ ln /mnt/x/foo /mnt/x/bar $ mv /mnt/y/z /mnt/x/z < power fail > $ mount /dev/sdc /mnt $ ls -1Ri /mnt /mnt: 257 x 258 y /mnt/x: 259 bar 259 foo 260 z /mnt/x/z: /mnt/y: 260 z /mnt/y/z: $ umount /dev/sdc $ btrfs check /dev/sdc Checking filesystem on /dev/sdc UUID: a67e2c4a-a4b4-4fdc-b015-9d9af1e344be checking extents checking free space cache checking fs roots root 5 inode 260 errors 2000, link count wrong unresolved ref dir 257 index 4 namelen 1 name z filetype 2 errors 0 unresolved ref dir 258 index 2 namelen 1 name z filetype 2 errors 0 (...) Attempting to remove the directory becomes impossible: $ mount /dev/sdc /mnt $ rmdir /mnt/y/z $ ls -lh /mnt/y ls: cannot access /mnt/y/z: No such file or directory total 0 d????????? ? ? ? ? ? 
z $ rmdir /mnt/x/z rmdir: failed to remove ‘/mnt/x/z’: Stale file handle $ ls -lh /mnt/x ls: cannot access /mnt/x/z: Stale file handle total 0 -rw-r--r-- 2 root root 0 Apr 6 18:06 bar -rw-r--r-- 2 root root 0 Apr 6 18:06 foo d????????? ? ? ? ? ? z So make sure that on rename we set the last_unlink_trans value for our inode, even if it's a directory, to the value of the current transaction's ID and that if the new parent directory is logged that we fallback to a transaction commit. A test case for fstests is being submitted as well. Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 517d0ccb351e..4709932c62fb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5278,11 +5278,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (IS_ERR(dir_inode)) continue; + if (ctx) + ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && btrfs_must_commit_transaction(trans, dir_inode)) ret = 1; + if (!ret && ctx && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, root, + dir_inode, ctx); iput(dir_inode); if (ret) goto out; @@ -5652,11 +5657,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) { - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); - } + mutex_lock(&BTRFS_I(inode)->log_mutex); + BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); /* * if this directory was already logged any new -- cgit v1.2.3-55-g7522 From 3f9749f6e9edcf8ec569fb542efc3be35e06e84a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Apr 2016 04:45:02 +0100 Subject: Btrfs: fix empty symlink after creating symlink and fsync parent dir If we create a symlink, fsync its parent directory, crash/power fail and mount the filesystem, we end up with an empty symlink, which not only is useless it's also not allowed in linux (the man page symlink(2) is well explicit about that). So we just need to make sure to fully log an inode if it's a symlink, to ensure its inline extent gets logged, ensuring the same behaviour as ext3, ext4, xfs, reiserfs, f2fs, nilfs2, etc. Example reproducer: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ mkdir /mnt/testdir $ sync $ ln -s /mnt/foo /mnt/testdir/bar $ xfs_io -c fsync /mnt/testdir $ mount /dev/sdb /mnt $ readlink /mnt/testdir/bar A test case for fstests follows soon. 
Signed-off-by: Filipe Manana --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4709932c62fb..a24a0ba523d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5158,7 +5158,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, -- cgit v1.2.3-55-g7522 From 578def7c50f236432ba140d35bb7ca4ef0a1b20b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Apr 2016 15:36:38 +0100 Subject: Btrfs: don't wait for unrelated IO to finish before relocation Before the relocation process of a block group starts, it sets the block group to readonly mode, then flushes all delalloc writes and then finally it waits for all ordered extents to complete. This last step includes waiting for ordered extents destinated at extents allocated in other block groups, making us waste unecessary time. So improve this by waiting only for ordered extents that fall into the block group's range. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik Reviewed-by: Liu Bo --- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/extent-tree.c | 11 +++++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 26 +++++++++++++++++++------- fs/btrfs/ordered-data.h | 6 ++++-- fs/btrfs/relocation.c | 4 +++- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 2 +- 8 files changed, 38 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 26bcb487f958..3371f9e546d9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -403,7 +403,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -495,7 +495,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..251452a2b72c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4141,7 +4141,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4583,7 +4583,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4632,7 +4633,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } @@ -4671,7 +4673,8 @@ skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left 
= schedule_timeout_killable(1); if (time_left) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..697cc336bd1c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -681,7 +681,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1); + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0de7da5a610d..559170464d7c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len) { - struct list_head splice, works; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; int count = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); + const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); + + if (range_end <= ordered->start || + ordered->start + ordered->disk_len <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + list_move_tail(&ordered->root_extent_list, &root->ordered_extents); atomic_inc(&ordered->refs); @@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) nr--; count++; } + list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); @@ -708,7 +718,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; @@ -728,7 +739,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 23c96059cef2..8ef12623d65c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len); +void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len); void 
btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, const loff_t start, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 08ef890deca6..30f77ed60133 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4259,7 +4259,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) err = ret; goto out; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, + rc->block_group->key.objectid, + rc->block_group->key.offset); while (1) { mutex_lock(&fs_info->cleaner_mutex); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..89d134794d47 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1160,7 +1160,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..f0bb54a77314 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1821,7 +1821,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } static inline void -- cgit v1.2.3-55-g7522 From 9cfa3e34e20e6798a671236000d9e97c8aa5d318 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 26 Apr 2016 15:39:32 +0100 Subject: Btrfs: don't do unnecessary delalloc flushes when relocating Before we start the actual relocation process of a block group, we do calls to flush delalloc of all inodes and then wait for ordered extents to complete. However we do these flush calls just to make sure we don't race with concurrent tasks that have actually already started to run delalloc and have allocated an extent from the block group we want to relocate, right before we set it to readonly mode, but have not yet created the respective ordered extents. The flush calls make us wait for such concurrent tasks because they end up calling filemap_fdatawrite_range() (through btrfs_start_delalloc_roots() -> __start_delalloc_inodes() -> btrfs_alloc_delalloc_work() -> btrfs_run_delalloc_work()) which ends up serializing us with those tasks due to attempts to lock the same pages (and the delalloc flush procedure calls the allocator and creates the ordered extents before unlocking the pages). These flushing calls not only make us waste time (cpu, IO) but also reduce the chances of writing larger extents (applications might be writing to contiguous ranges and we flush before they finish dirtying the whole ranges). So make sure we don't flush delalloc and just wait for concurrent tasks that have already started flushing delalloc and have allocated an extent from the block group we are about to relocate. This change also ends up fixing a race with direct IO writes that makes relocation not wait for direct IO ordered extents. 
This race is illustrated by the following diagram: CPU 1 CPU 2 btrfs_relocate_block_group(bg X) starts direct IO write, target inode currently has no ordered extents ongoing nor dirty pages (delalloc regions), therefore the root for our inode is not in the list fs_info->ordered_roots btrfs_direct_IO() __blockdev_direct_IO() btrfs_get_blocks_direct() btrfs_lock_extent_direct() locks range in the io tree btrfs_new_extent_direct() btrfs_reserve_extent() --> extent allocated from bg X btrfs_inc_block_group_ro(bg X) btrfs_start_delalloc_roots() __start_delalloc_inodes() --> does nothing, no dealloc ranges in the inode's io tree so the inode's root is not in the list fs_info->delalloc_roots btrfs_wait_ordered_roots() --> does not find the inode's root in the list fs_info->ordered_roots --> ends up not waiting for the direct IO write started by the task at CPU 2 relocate_block_group(rc->stage == MOVE_DATA_EXTENTS) prepare_to_relocate() btrfs_commit_transaction() iterates the extent tree, using its commit root and moves extents into new locations btrfs_add_ordered_extent_dio() --> now a ordered extent is created and added to the list root->ordered_extents and the root added to the list fs_info->ordered_roots --> this is too late and the task at CPU 1 already started the relocation btrfs_commit_transaction() btrfs_finish_ordered_io() btrfs_alloc_reserved_file_extent() --> adds delayed data reference for the extent allocated from bg X relocate_block_group(rc->stage == UPDATE_DATA_PTRS) prepare_to_relocate() btrfs_commit_transaction() --> delayed refs are run, so an extent item for the allocated extent from bg X is added to extent tree --> commit roots are switched, so the next scan in the extent tree will see the extent item sees the extent in the extent tree When this happens the relocation produces the following warning when it finishes: [ 7260.832836] ------------[ cut here ]------------ [ 7260.834653] WARNING: CPU: 5 PID: 6765 at fs/btrfs/relocation.c:4318 btrfs_relocate_block_group+0x245/0x2a1 [btrfs]() [ 7260.838268] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc [ 7260.850935] CPU: 5 PID: 6765 Comm: btrfs Not tainted 4.5.0-rc6-btrfs-next-28+ #1 [ 7260.852998] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 7260.852998] 0000000000000000 ffff88020bf57bc0 ffffffff812648b3 0000000000000000 [ 7260.852998] 0000000000000009 ffff88020bf57bf8 ffffffff81051608 ffffffffa03c1b2d [ 7260.852998] ffff8800b2bbb800 0000000000000000 ffff8800b17bcc58 ffff8800399dd000 [ 7260.852998] Call Trace: [ 7260.852998] [] dump_stack+0x67/0x90 [ 7260.852998] [] warn_slowpath_common+0x99/0xb2 [ 7260.852998] [] ? btrfs_relocate_block_group+0x245/0x2a1 [btrfs] [ 7260.852998] [] warn_slowpath_null+0x1a/0x1c [ 7260.852998] [] btrfs_relocate_block_group+0x245/0x2a1 [btrfs] [ 7260.852998] [] btrfs_relocate_chunk.isra.29+0x66/0xdb [btrfs] [ 7260.852998] [] btrfs_balance+0xde1/0xe4e [btrfs] [ 7260.852998] [] ? debug_smp_processor_id+0x17/0x19 [ 7260.852998] [] btrfs_ioctl_balance+0x255/0x2d3 [btrfs] [ 7260.852998] [] btrfs_ioctl+0x11e0/0x1dff [btrfs] [ 7260.852998] [] ? handle_mm_fault+0x443/0xd63 [ 7260.852998] [] ? _raw_spin_unlock+0x31/0x44 [ 7260.852998] [] ? arch_local_irq_save+0x9/0xc [ 7260.852998] [] vfs_ioctl+0x18/0x34 [ 7260.852998] [] do_vfs_ioctl+0x550/0x5be [ 7260.852998] [] ? 
__fget_light+0x4d/0x71 [ 7260.852998] [] SyS_ioctl+0x57/0x79 [ 7260.852998] [] entry_SYSCALL_64_fastpath+0x12/0x6b [ 7260.893268] ---[ end trace eb7803b24ebab8ad ]--- This is because at the end of the first stage, in relocate_block_group(), we commit the current transaction, which makes delayed refs run, the commit roots are switched and so the second stage will find the extent item that the ordered extent added to the delayed refs. But this extent was not moved (ordered extent completed after first stage finished), so at the end of the relocation our block group item still has a positive used bytes counter, triggering a warning at the end of btrfs_relocate_block_group(). Later on when trying to read the extent contents from disk we hit a BUG_ON() due to the inability to map a block with a logical address that belongs to the block group we relocated and is no longer valid, resulting in the following trace: [ 7344.885290] BTRFS critical (device sdi): unable to find logical 12845056 len 4096 [ 7344.887518] ------------[ cut here ]------------ [ 7344.888431] kernel BUG at fs/btrfs/inode.c:1833! [ 7344.888431] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC [ 7344.888431] Modules linked in: btrfs crc32c_generic xor ppdev raid6_pq psmouse sg acpi_cpufreq evdev i2c_piix4 tpm_tis serio_raw tpm i2c_core pcspkr parport_pc [ 7344.888431] CPU: 0 PID: 6831 Comm: od Tainted: G W 4.5.0-rc6-btrfs-next-28+ #1 [ 7344.888431] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 7344.888431] task: ffff880215818600 ti: ffff880204684000 task.ti: ffff880204684000 [ 7344.888431] RIP: 0010:[] [] btrfs_merge_bio_hook+0x54/0x6b [btrfs] [ 7344.888431] RSP: 0018:ffff8802046878f0 EFLAGS: 00010282 [ 7344.888431] RAX: 00000000ffffffea RBX: 0000000000001000 RCX: 0000000000000001 [ 7344.888431] RDX: ffff88023ec0f950 RSI: ffffffff8183b638 RDI: 00000000ffffffff [ 7344.888431] RBP: ffff880204687908 R08: 0000000000000001 R09: 0000000000000000 [ 7344.888431] R10: ffff880204687770 R11: ffffffff82f2d52d R12: 0000000000001000 [ 7344.888431] R13: ffff88021afbfee8 R14: 0000000000006208 R15: ffff88006cd199b0 [ 7344.888431] FS: 00007f1f9e1d6700(0000) GS:ffff88023ec00000(0000) knlGS:0000000000000000 [ 7344.888431] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 7344.888431] CR2: 00007f1f9dc8cb60 CR3: 000000023e3b6000 CR4: 00000000000006f0 [ 7344.888431] Stack: [ 7344.888431] 0000000000001000 0000000000001000 ffff880204687b98 ffff880204687950 [ 7344.888431] ffffffffa0395c8f ffffea0004d64d48 0000000000000000 0000000000001000 [ 7344.888431] ffffea0004d64d48 0000000000001000 0000000000000000 0000000000000000 [ 7344.888431] Call Trace: [ 7344.888431] [] submit_extent_page+0xf5/0x16f [btrfs] [ 7344.888431] [] __do_readpage+0x4a0/0x4f1 [btrfs] [ 7344.888431] [] ? btrfs_create_repair_bio+0xcb/0xcb [btrfs] [ 7344.888431] [] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [] ? trace_hardirqs_on+0xd/0xf [ 7344.888431] [] __do_contiguous_readpages.constprop.26+0xc2/0xe4 [btrfs] [ 7344.888431] [] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [] __extent_readpages.constprop.25+0xed/0x100 [btrfs] [ 7344.888431] [] ? lru_cache_add+0xe/0x10 [ 7344.888431] [] extent_readpages+0x160/0x1aa [btrfs] [ 7344.888431] [] ? btrfs_writepage_start_hook+0xbc/0xbc [btrfs] [ 7344.888431] [] ? 
alloc_pages_current+0xa9/0xcd [ 7344.888431] [] btrfs_readpages+0x1f/0x21 [btrfs] [ 7344.888431] [] __do_page_cache_readahead+0x168/0x1fc [ 7344.888431] [] ondemand_readahead+0x1f6/0x207 [ 7344.888431] [] ? ondemand_readahead+0x1f6/0x207 [ 7344.888431] [] ? pagecache_get_page+0x2b/0x154 [ 7344.888431] [] page_cache_sync_readahead+0x3d/0x3f [ 7344.888431] [] generic_file_read_iter+0x197/0x4e1 [ 7344.888431] [] __vfs_read+0x79/0x9d [ 7344.888431] [] vfs_read+0x8f/0xd2 [ 7344.888431] [] SyS_read+0x50/0x7e [ 7344.888431] [] entry_SYSCALL_64_fastpath+0x12/0x6b [ 7344.888431] Code: 8d 4d e8 45 31 c9 45 31 c0 48 8b 00 48 c1 e2 09 48 8b 80 80 fc ff ff 4c 89 65 e8 48 8b b8 f0 01 00 00 e8 1d 42 02 00 85 c0 79 02 <0f> 0b 4c 0 [ 7344.888431] RIP [] btrfs_merge_bio_hook+0x54/0x6b [btrfs] [ 7344.888431] RSP [ 7344.970544] ---[ end trace eb7803b24ebab8ae ]--- Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik Reviewed-by: Liu Bo --- fs/btrfs/ctree.h | 14 ++++++++++++ fs/btrfs/extent-tree.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/inode.c | 8 +++++++ fs/btrfs/relocation.c | 6 +----- 4 files changed, 79 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..90e70e21e479 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1408,6 +1408,17 @@ struct btrfs_block_group_cache { struct btrfs_io_ctl io_ctl; + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + /* Lock for free space tree operations. 
*/ struct mutex free_space_lock; @@ -3499,6 +3510,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 251452a2b72c..09aad7b447f5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6175,6 +6175,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -7434,6 +7485,7 @@ checks: btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7615,8 +7667,10 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, flags, delalloc); - - if (ret == -ENOSPC) { + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2aaba58b4856..3991d8a74f61 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -824,6 +824,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. 
@@ -861,6 +862,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1070,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -7162,6 +7167,8 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, return ERR_PTR(ret); } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, 0); if (IS_ERR(em)) { @@ -9942,6 +9949,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 30f77ed60133..e78f8e44bd9a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4254,11 +4254,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); - if (ret < 0) { - err = ret; - goto out; - } + btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_ordered_roots(fs_info, -1, rc->block_group->key.objectid, rc->block_group->key.offset); -- cgit v1.2.3-55-g7522 From 3dc9e8f76720fbbd9c56a11775932733fe13d214 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Apr 2016 11:34:22 +0100 Subject: Btrfs: unpin log if rename operation fails If rename operations fail at some point after we pinned the log, we end up aborting the current transaction but never unpin the log, which leaves concurrent tasks that are trying to sync the log (as part of an fsync request from user space) blocked forever and preventing the filesystem from being unmountable. Fix this by safely unpinning the log. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3991d8a74f61..4594a263a5f8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9406,6 +9406,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 root_objectid; int ret; u64 old_ino = btrfs_ino(old_inode); + bool log_pinned = false; if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9493,6 +9494,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * we unlink the name but before we add the new name back in. 
*/ btrfs_pin_log_trans(root); + log_pinned = true; } inode_inc_iversion(old_dir); @@ -9559,12 +9561,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (log_pinned) { struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + log_pinned = false; } out_fail: + /* + * If we have pinned the log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && log_pinned) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + btrfs_end_log_trans(root); + log_pinned = false; + } btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) -- cgit v1.2.3-55-g7522 From c4aba9545430ed0f842d0833072f990a42da90f0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Apr 2016 13:14:42 +0100 Subject: Btrfs: pin log earlier when renaming We were pinning the log right after the first step in the rename operation (inserting inode ref for the new name in the destination directory) instead of doing it before. This behaviour was introduced in 2009 for some reason that was not mentioned neither on the changelog nor any comment, with the drawback of a small time window where concurrent log writers can end up logging the new inode reference for the inode we are renaming while the rename operation is in progress (so that we can end up with a log containing both the new and old references). As of today there's no reason to not pin the log before that first step anymore, so just fix this. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4594a263a5f8..422833e70d00 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9479,6 +9479,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9486,15 +9488,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_ino(new_dir), index); if (ret) goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. 
- */ - btrfs_pin_log_trans(root); - log_pinned = true; } inode_inc_iversion(old_dir); -- cgit v1.2.3-55-g7522 From cdd1fedf8261cd7a73c0596298902ff4f0f04492 Mon Sep 17 00:00:00 2001 From: Dan Fuhry Date: Thu, 17 Mar 2016 15:23:38 +0100 Subject: btrfs: add support for RENAME_EXCHANGE and RENAME_WHITEOUT Two new flags, RENAME_EXCHANGE and RENAME_WHITEOUT, provide for new behavior in the renameat2() syscall. This behavior is primarily used by overlayfs. This patch adds support for these flags to btrfs, enabling it to be used as a fully functional upper layer for overlayfs. RENAME_EXCHANGE support was written by Davide Italiano originally submitted on 2 April 2015. Signed-off-by: Davide Italiano Signed-off-by: Dan Fuhry [ remove unlikely ] Signed-off-by: David Sterba Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 264 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 257 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 422833e70d00..452cfefbf4b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9394,8 +9394,244 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + struct dentry *parent; + u64 old_ino = btrfs_ino(old_inode); + u64 new_ino = btrfs_ino(new_inode); + u64 old_idx = 0; + u64 new_idx = 0; + u64 root_objectid; + int ret; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&dest->fs_info->subvol_sem); + + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 12); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(new_dir, &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(old_dir, &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), old_idx); + if (ret) + goto out_fail; + btrfs_pin_log_trans(root); + } + + /* And now for the dest. 
*/ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(dest->fs_info, trans); + } else { + ret = btrfs_insert_inode_ref(trans, root, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_ino, + btrfs_ino(old_dir), new_idx); + if (ret) + goto out_fail; + btrfs_pin_log_trans(dest); + } + + /* Update inode version and ctime/mtime. */ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + new_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, + root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, dest, new_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, old_dir, new_inode, + old_dentry->d_name.name, + old_dentry->d_name.len, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + } + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { + parent = old_dentry->d_parent; + btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_end_log_trans(dest); + } +out_fail: + ret = btrfs_end_transaction(trans, root); +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&dest->fs_info->subvol_sem); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode; + u64 objectid; + u64 index; + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + return ret; + + inode = btrfs_new_inode(trans, root, dir, + dentry->d_name.name, + dentry->d_name.len, + btrfs_ino(dir), + objectid, + 
S_IFCHR | WHITEOUT_MODE, + &index); + + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + return ret; + } + + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, + WHITEOUT_DEV); + + ret = btrfs_init_inode_security(trans, inode, dir, + &dentry->d_name); + if (ret) + return ret; + + ret = btrfs_add_nondir(trans, dir, dentry, + inode, 0, index); + if (ret) + return ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + return ret; + + unlock_new_inode(inode); + iput(inode); + + return 0; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -9457,15 +9693,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal + * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. */ trans = btrfs_start_transaction(root, 11); if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -9561,6 +9797,16 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); log_pinned = false; } + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_whiteout_for_rename(trans, root, old_dir, + old_dentry); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + } out_fail: /* * If we have pinned the log and an error happened, we unpin tasks @@ -9596,10 +9842,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); + if (flags & RENAME_EXCHANGE) + return btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } static void btrfs_run_delalloc_work(struct btrfs_work *work) -- cgit v1.2.3-55-g7522 From c990161888f387db136856337c237aa8d5003292 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 May 2016 01:41:57 +0100 Subject: Btrfs: fix inode leak on failure to setup whiteout inode in rename If we failed to fully setup the whiteout inode during a rename operation with the whiteout flag, we ended up leaking the inode, not decrementing its link count nor removing all its items from the fs/subvol tree. 
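In outline, the fix routes every failure after the whiteout inode has been created through a single exit label that drops the inode's link count and puts the last reference, so eviction can remove its items. A condensed sketch of that shape (the complete change is the hunk below; error paths before the inode is created are unchanged):

	ret = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (ret)
		goto out;
	ret = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
	if (ret)
		goto out;
	ret = btrfs_update_inode(trans, root, inode);
out:
	unlock_new_inode(inode);
	if (ret)
		inode_dec_link_count(inode);	/* drop the link count so eviction removes the inode's items */
	iput(inode);				/* last reference, in both the success and the error case */
	return ret;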
Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 452cfefbf4b8..98c119b02916 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9612,21 +9612,21 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, ret = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (ret) - return ret; + goto out; ret = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (ret) - return ret; + goto out; ret = btrfs_update_inode(trans, root, inode); - if (ret) - return ret; - +out: unlock_new_inode(inode); + if (ret) + inode_dec_link_count(inode); iput(inode); - return 0; + return ret; } static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- cgit v1.2.3-55-g7522 From 86e8aa0e772caba5f0e0471d5f836b2b997dcb3e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 May 2016 02:02:27 +0100 Subject: Btrfs: unpin logs if rename exchange operation fails If rename exchange operations fail at some point after we pinned any of the logs, we end up aborting the current transaction but never unpin the logs, which leaves concurrent tasks that are trying to sync the logs (as part of an fsync request from user space) blocked forever and preventing the filesystem from being unmountable. Fix this by safely unpinning the log. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98c119b02916..c92d9b83bb38 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9412,6 +9412,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, u64 new_idx = 0; u64 root_objectid; int ret; + bool root_log_pinned = false; + bool dest_log_pinned = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) @@ -9464,6 +9466,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (ret) goto out_fail; btrfs_pin_log_trans(root); + root_log_pinned = true; } /* And now for the dest. */ @@ -9479,6 +9482,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (ret) goto out_fail; btrfs_pin_log_trans(dest); + dest_log_pinned = true; } /* Update inode version and ctime/mtime. */ @@ -9557,17 +9561,47 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (root_log_pinned) { parent = new_dentry->d_parent; btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + root_log_pinned = false; } - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (dest_log_pinned) { parent = old_dentry->d_parent; btrfs_log_new_name(trans, new_inode, new_dir, parent); btrfs_end_log_trans(dest); + dest_log_pinned = false; } out_fail: + /* + * If we have pinned a log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. 
+ */ + if (ret && (root_log_pinned || dest_log_pinned)) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + if (root_log_pinned) { + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } + } ret = btrfs_end_transaction(trans, root); out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID) -- cgit v1.2.3-55-g7522 From 376e5a57bf7f1466031a957d04bf8b8f6801ee6d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 May 2016 02:08:56 +0100 Subject: Btrfs: pin logs earlier when doing a rename exchange operation The btrfs_rename_exchange() started as a copy-paste from btrfs_rename(), which had a race fixed by my previous patch titled "Btrfs: pin log earlier when renaming", and so it suffers from the same problem. We pin the logs of the affected roots after we insert the new inode references, leaving a time window where concurrent tasks logging the inodes can end up logging both the new and old references, resulting in log trees that when replayed can turn the metadata into inconsistent states. This behaviour was added to btrfs_rename() in 2009 without any explanation about why not pinning the logs earlier, just leaving a comment about the posibility for the race. As of today it's perfectly safe and sane to pin the logs before we start doing any of the steps involved in the rename operation. Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c92d9b83bb38..a7db3ed6ac88 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9458,6 +9458,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + root_log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9465,8 +9467,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_ino(new_dir), old_idx); if (ret) goto out_fail; - btrfs_pin_log_trans(root); - root_log_pinned = true; } /* And now for the dest. */ @@ -9474,6 +9474,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(dest->fs_info, trans); } else { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; ret = btrfs_insert_inode_ref(trans, root, old_dentry->d_name.name, old_dentry->d_name.len, @@ -9481,8 +9483,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_ino(old_dir), new_idx); if (ret) goto out_fail; - btrfs_pin_log_trans(dest); - dest_log_pinned = true; } /* Update inode version and ctime/mtime. */ -- cgit v1.2.3-55-g7522 From 5062af35c3c6e49110ab1ec99295339259298a3d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 May 2016 10:26:26 +0100 Subject: Btrfs: fix number of transaction units for renames with whiteout When we do a rename with the whiteout flag, we need to create the whiteout inode, which in the worst case requires 5 transaction units (1 inode item, 1 inode ref, 2 dir items and 1 xattr if selinux is enabled). So bump the number of transaction units from 11 to 16 if the whiteout flag is set. 
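Spelled out, the worst-case arithmetic is: 5 item modifications per inode * 2 inodes + 1 for the new link = 11 units for a plain rename; a whiteout adds 1 inode item + 1 inode ref + 2 dir items + 1 xattr item = 5 more, giving 11 + 5 = 16. A minimal sketch of the resulting reservation (variable name as used in the hunk below):

	unsigned int trans_num_items = 11;

	if (flags & RENAME_WHITEOUT)
		trans_num_items += 5;	/* whiteout inode: inode item + inode ref + 2 dir items + xattr */
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}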
Signed-off-by: Filipe Manana --- fs/btrfs/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7db3ed6ac88..0921d2b07f1d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9668,6 +9668,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned int flags) { struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); @@ -9730,8 +9731,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. + * If our rename has the whiteout flag, we need more 5 units for the + * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item + * when selinux is enabled). */ - trans = btrfs_start_transaction(root, 11); + trans_num_items = 11; + if (flags & RENAME_WHITEOUT) + trans_num_items += 5; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; -- cgit v1.2.3-55-g7522 From 0b901916a00bc7b14ee83cc8e41c3b0d561a8f22 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 May 2016 13:15:27 +0100 Subject: Btrfs: fix race between fsync and direct IO writes for prealloc extents When we do a direct IO write against a preallocated extent (fallocate) that does not go beyond the i_size of the inode, we do the write operation without holding the inode's i_mutex (an optimization that landed in commit 38851cc19adb ("Btrfs: implement unlocked dio write")). This allows for a very tiny time window where a race can happen with a concurrent fsync using the fast code path, as the direct IO write path creates first a new extent map (no longer flagged as a prealloc extent) and then it creates the ordered extent, while the fast fsync path first collects ordered extents and then it collects extent maps. This allows for the possibility of the fast fsync path to collect the new extent map without collecting the new ordered extent, and therefore logging an extent item based on the extent map without waiting for the ordered extent to be created and complete. This can result in a situation where after a log replay we end up with an extent not marked anymore as prealloc but it was only partially written (or not written at all), exposing random, stale or garbage data corresponding to the unwritten pages and without any checksums in the csum tree covering the extent's range. This is an extension of what was done in commit de0ee0edb21f ("Btrfs: fix race between fsync and lockless direct IO writes"). So fix this by creating first the ordered extent and then the extent map, so that this way if the fast fsync patch collects the new extent map it also collects the corresponding ordered extent. 
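Condensed, the corrected ordering in the nocow/prealloc direct IO path is: create the ordered extent first, and only then swap in the new extent map. A sketch taken from the hunk below, with the error handling for create_pinned_em() trimmed:

	ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
					   len, len, type);
	if (ret) {
		free_extent_map(em);
		goto unlock_err;
	}
	if (type == BTRFS_ORDERED_PREALLOC) {
		/*
		 * Replace the prealloc extent map only after the ordered
		 * extent exists, so a fast fsync that picks up the new map
		 * also finds the ordered extent and waits for it.
		 */
		free_extent_map(em);
		em = create_pinned_em(inode, start, len, orig_start,
				      block_start, len, orig_block_len,
				      ram_bytes, type);
	}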
Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/inode.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0921d2b07f1d..45d0dafbbf40 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7658,6 +7658,25 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes) == 1) { + + /* + * Create the ordered extent before the extent map. This + * is to avoid races with the fast fsync path because it + * collects ordered extents into a local list and then + * collects all the new extent maps, so we must create + * the ordered extent first and make sure the fast fsync + * path collects any new ordered extents after + * collecting new extent maps as well. The fsync path + * simply can not rely on inode_dio_wait() because it + * causes deadlock with AIO. + */ + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + if (ret) { + free_extent_map(em); + goto unlock_err; + } + if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = create_pinned_em(inode, start, len, @@ -7666,17 +7685,29 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, orig_block_len, ram_bytes, type); if (IS_ERR(em)) { + struct btrfs_ordered_extent *oe; + ret = PTR_ERR(em); + oe = btrfs_lookup_ordered_extent(inode, + start); + ASSERT(oe); + if (WARN_ON(!oe)) + goto unlock_err; + set_bit(BTRFS_ORDERED_IOERR, + &oe->flags); + set_bit(BTRFS_ORDERED_IO_DONE, + &oe->flags); + btrfs_remove_ordered_extent(inode, oe); + /* + * Once for our lookup and once for the + * ordered extents tree. + */ + btrfs_put_ordered_extent(oe); + btrfs_put_ordered_extent(oe); goto unlock_err; } } - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - if (ret) { - free_extent_map(em); - goto unlock_err; - } goto unlock; } } -- cgit v1.2.3-55-g7522 From f78c436c3931e7df713688028f2b4faf72bf9f2a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 9 May 2016 13:15:41 +0100 Subject: Btrfs: fix race between block group relocation and nocow writes Relocation of a block group waits for all existing tasks flushing dellaloc, starting direct IO writes and any ordered extents before starting the relocation process. However for direct IO writes that end up doing nocow (inode either has the flag nodatacow set or the write is against a prealloc extent) we have a short time window that allows for a race that makes relocation proceed without waiting for the direct IO write to complete first, resulting in data loss after the relocation finishes. 
This is illustrated by the following diagram: CPU 1 CPU 2 btrfs_relocate_block_group(bg X) direct IO write starts against an extent in block group X using nocow mode (inode has the nodatacow flag or the write is for a prealloc extent) btrfs_direct_IO() btrfs_get_blocks_direct() --> can_nocow_extent() returns 1 btrfs_inc_block_group_ro(bg X) --> turns block group into RO mode btrfs_wait_ordered_roots() --> returns and does not know about the DIO write happening at CPU 2 (the task there has not created yet an ordered extent) relocate_block_group(bg X) --> rc->stage == MOVE_DATA_EXTENTS find_next_extent() --> returns extent that the DIO write is going to write to relocate_data_extent() relocate_file_extent_cluster() --> reads the extent from disk into pages belonging to the relocation inode and dirties them --> creates DIO ordered extent btrfs_submit_direct() --> submits bio against a location on disk obtained from an extent map before the relocation started btrfs_wait_ordered_range() --> writes all the pages read before to disk (belonging to the relocation inode) relocation finishes bio completes and wrote new data to the old location of the block group So fix this by tracking the number of nocow writers for a block group and make sure relocation waits for that number to go down to 0 before starting to move the extents. The same race can also happen with buffered writes in nocow mode since the patch I recently made titled "Btrfs: don't do unnecessary delalloc flushes when relocating", because we are no longer flushing all delalloc which served as a synchonization mechanism (due to page locking) and ensured the ordered extents for nocow buffered writes were created before we called btrfs_wait_ordered_roots(). The race with direct IO writes in nocow mode existed before that patch (no pages are locked or used during direct IO) and that fixed only races with direct IO writes that do cow. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/ctree.h | 13 +++++++++++++ fs/btrfs/extent-tree.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 15 +++++++++++++- fs/btrfs/relocation.c | 1 + 4 files changed, 81 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 90e70e21e479..7ae758685c7b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1419,6 +1419,16 @@ struct btrfs_block_group_cache { */ atomic_t reservations; + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + /* Lock for free space tree operations. 
*/ struct mutex free_space_lock; @@ -3513,6 +3523,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 09aad7b447f5..dcf89bfa990d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 45d0dafbbf40..ee9be4199e7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1382,6 +1382,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if (!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1396,6 +1399,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1416,6 +1422,9 @@ out_check: if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1458,6 +1467,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -7657,7 +7668,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 
1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { /* * Create the ordered extent before the extent map. This @@ -7672,6 +7684,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, */ ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len, len, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (ret) { free_extent_map(em); goto unlock_err; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e78f8e44bd9a..054d9a80e77e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4255,6 +4255,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rc->block_group->key.objectid, rc->block_group->flags); btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); btrfs_wait_ordered_roots(fs_info, -1, rc->block_group->key.objectid, rc->block_group->key.offset); -- cgit v1.2.3-55-g7522 From 5f9a8a51d8b95505d8de8b7191ae2ed8c504d4af Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 12 May 2016 13:53:36 +0100 Subject: Btrfs: add semaphore to synchronize direct IO writes with fsync Due to the optimization of lockless direct IO writes (the inode's i_mutex is not held) introduced in commit 38851cc19adb ("Btrfs: implement unlocked dio write"), we started having races between such writes with concurrent fsync operations that use the fast fsync path. These races were addressed in the patches titled "Btrfs: fix race between fsync and lockless direct IO writes" and "Btrfs: fix race between fsync and direct IO writes for prealloc extents". The races happened because the direct IO path, like every other write path, does create extent maps followed by the corresponding ordered extents while the fast fsync path collected first ordered extents and then it collected extent maps. This made it possible to log file extent items (based on the collected extent maps) without waiting for the corresponding ordered extents to complete (get their IO done). The two fixes mentioned before added a solution that consists of making the direct IO path create first the ordered extents and then the extent maps, while the fsync path attempts to collect any new ordered extents once it collects the extent maps. This was simple and did not require adding any synchonization primitive to any data structure (struct btrfs_inode for example) but it makes things more fragile for future development endeavours and adds an exceptional approach compared to the other write paths. This change adds a read-write semaphore to the btrfs inode structure and makes the direct IO path create the extent maps and the ordered extents while holding read access on that semaphore, while the fast fsync path collects extent maps and ordered extents while holding write access on that semaphore. The logic for direct IO write path is encapsulated in a new helper function that is used both for cow and nocow direct IO writes. 
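In outline, the protocol this change introduces is a plain reader/writer split (the field and helper names below are the ones added by the hunks that follow): every direct IO write, cow or nocow, creates its extent map and ordered extent while holding read access to the new semaphore, and the fast fsync path collects them while holding write access:

	/* Direct IO write path (cow and nocow), see btrfs_create_dio_extent(): */
	down_read(&BTRFS_I(inode)->dio_sem);
	/* create the extent map and the matching ordered extent */
	up_read(&BTRFS_I(inode)->dio_sem);

	/* Fast fsync path, see btrfs_log_changed_extents(): */
	down_write(&BTRFS_I(inode)->dio_sem);
	/* collect ordered extents and extent maps for logging */
	up_write(&BTRFS_I(inode)->dio_sem);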
Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 10 ++++ fs/btrfs/inode.c | 134 +++++++++++++++++++------------------------------ fs/btrfs/tree-log.c | 51 ++++++------------- 3 files changed, 77 insertions(+), 118 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 61205e3bbefa..1da5753d886d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -196,6 +196,16 @@ struct btrfs_inode { struct list_head delayed_iput; long delayed_iput_count; + /* + * To avoid races between lockless (i_mutex not held) direct IO writes + * and concurrent fsync requests. Direct IO writes must acquire read + * access on this semaphore for creating an extent map and its + * corresponding ordered extent. The fast fsync path must acquire write + * access on this semaphore before it collects ordered extents and + * extent maps. + */ + struct rw_semaphore dio_sem; + struct inode vfs_inode; }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee9be4199e7c..c1ee4ade2d87 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7145,6 +7145,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7160,43 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. 
- */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); - - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } + return em; } @@ -7670,57 +7677,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes) == 1 && btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; - /* - * Create the ordered extent before the extent map. This - * is to avoid races with the fast fsync path because it - * collects ordered extents into a local list and then - * collects all the new extent maps, so we must create - * the ordered extent first and make sure the fast fsync - * path collects any new ordered extents after - * collecting new extent maps as well. The fsync path - * simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); btrfs_dec_nocow_writers(root->fs_info, block_start); - if (ret) { - free_extent_map(em); - goto unlock_err; - } - if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - - ret = PTR_ERR(em); - oe = btrfs_lookup_ordered_extent(inode, - start); - ASSERT(oe); - if (WARN_ON(!oe)) - goto unlock_err; - set_bit(BTRFS_ORDERED_IOERR, - &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, - &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* - * Once for our lookup and once for the - * ordered extents tree. 
- */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - goto unlock_err; - } + em = em2; + } + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto unlock_err; } - goto unlock; } } @@ -9281,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a24a0ba523d6..003a826f4cff 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); + btrfs_get_logged_extents(inode, logged_list, start, end); /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. */ - btrfs_get_logged_extents(inode, logged_list, start, end); + ret = btrfs_inode_check_errors(inode); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4210,7 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4622,23 +4631,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - /* * a brute force approach to making sure we get the most uptodate * copies of everything. 
@@ -4846,21 +4838,6 @@ log_extents: goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) { -- cgit v1.2.3-55-g7522 From 4673272f43ae790ab9ec04e38a7542f82bb8f020 Mon Sep 17 00:00:00 2001 From: Scott Talbert Date: Mon, 9 May 2016 09:14:28 -0400 Subject: btrfs: fix memory leak during RAID 5/6 device replacement A 'struct bio' is allocated in scrub_missing_raid56_pages(), but it was never freed anywhere. Signed-off-by: Scott Talbert Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d270c700ed31..fa35cdc46494 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2127,6 +2127,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio) if (bio->bi_error) sblock->no_io_error_seen = 0; + bio_put(bio); + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); } -- cgit v1.2.3-55-g7522