From 248d200df34f3e44a4140f32dfc7428c52615332 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:19 +0800
Subject: Btrfs: sysfs: fix, btrfs_release_super_kobj() should to clean up the
 kobject data

The following test case fails indicating that, thread tried to init an initialized object.

kernel: [232104.016513] kobject (ffff880006c1c980): tried to init an initialized object, something is seriously wrong.

btrfs_sysfs_remove_one() self test code:

open_tree()
{
 ::
        ret = btrfs_sysfs_add_one(fs_info);
	if (ret) {
              pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
                goto fail_block_groups;
        }
+       btrfs_sysfs_remove_one(fs_info);
+       ret = btrfs_sysfs_add_one(fs_info);
+       if (ret) {
+               pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+               goto fail_block_groups;
+       }

cleaning up the unregistered kobject fixes this.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e8a4c86d274d..db2f8aed2b7d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -439,6 +439,8 @@ static struct attribute *btrfs_attrs[] = {
 static void btrfs_release_super_kobj(struct kobject *kobj)
 {
 	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	memset(&fs_info->super_kobj, 0, sizeof(struct kobject));
 	complete(&fs_info->kobj_unregister);
 }
 
-- 
cgit v1.2.3-55-g7522


From 4e51f005a22a2c7795351b975437b0cee0acce3e Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:20 +0800
Subject: Btrfs: sysfs: fix, fs_info kobject_unregister has init_completion()
 twice

kobject_unregister is to handle the release of the kobject,
its completion init is being called in btrfs_sysfs_add_one(),
so we don't have to do the same in the open_ctree() again.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b72d06..c42503cf3462 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2494,7 +2494,6 @@ int open_ctree(struct super_block *sb,
 	seqlock_init(&fs_info->profiles_lock);
 	init_rwsem(&fs_info->delayed_iput_sem);
 
-	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
-- 
cgit v1.2.3-55-g7522


From e7e1aa9c913da380fbf1977ac4fa98e464023605 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:21 +0800
Subject: Btrfs: sysfs: fix, undo sysfs device links

Theoritically need to remove the device links attributes, but since its entire device
kobject was removed, so there wasn't any issue of about it. Just do it nicely.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index db2f8aed2b7d..ca0786190edd 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -522,6 +522,7 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 		kobject_del(fs_info->space_info_kobj);
 		kobject_put(fs_info->space_info_kobj);
 	}
+	btrfs_kobj_rm_device(fs_info, NULL);
 	kobject_del(fs_info->device_dir_kobj);
 	kobject_put(fs_info->device_dir_kobj);
 	addrm_unknown_feature_attrs(fs_info, false);
@@ -604,6 +605,8 @@ static void init_feature_attrs(void)
 	}
 }
 
+/* when one_device is NULL, it removes all device links */
+
 int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
 		struct btrfs_device *one_device)
 {
@@ -621,6 +624,20 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
 						disk_kobj->name);
 	}
 
+	if (one_device)
+		return 0;
+
+	list_for_each_entry(one_device,
+			&fs_info->fs_devices->devices, dev_list) {
+		if (!one_device->bdev)
+			continue;
+		disk = one_device->bdev->bd_part;
+		disk_kobj = &part_to_dev(disk)->kobj;
+
+		sysfs_remove_link(fs_info->device_dir_kobj,
+						disk_kobj->name);
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3-55-g7522


From 8345ea31dc6e1772726d1d0b8864dedac55038e0 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Fri, 20 Mar 2015 18:01:20 +0800
Subject: Btrfs: sysfs: fix, kobject pointer clean up needed after kobject
 release

The sysfs clean up self test like in the below code fails, since
fs_info->device_dir_kobject still points to its stale kobject.
Reseting this pointer will help to fix this.

open_ctree()
{

ret = btrfs_sysfs_add_one(fs_info);
::
+       btrfs_sysfs_remove_one(fs_info);
+       ret = btrfs_sysfs_add_one(fs_info);
+       if (ret) {
+               pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
+               goto fail_block_groups;
+       }

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ca0786190edd..f31fcec01e9c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -525,6 +525,7 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 	btrfs_kobj_rm_device(fs_info, NULL);
 	kobject_del(fs_info->device_dir_kobj);
 	kobject_put(fs_info->device_dir_kobj);
+	fs_info->device_dir_kobj = NULL;
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
 	__btrfs_sysfs_remove_one(fs_info);
-- 
cgit v1.2.3-55-g7522


From 4d435731f99db87020730c4b17e1da148908340e Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:23 +0800
Subject: Btrfc: sysfs: fix, check if device_dir_kobj is init before destroy

Since the failure code in the btrfs_sysfs_add_one() can
call btrfs_sysfs_remove_one() even before device_dir_kobj
has been created we need to check if its null.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f31fcec01e9c..6218d31a6912 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -522,10 +522,12 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 		kobject_del(fs_info->space_info_kobj);
 		kobject_put(fs_info->space_info_kobj);
 	}
-	btrfs_kobj_rm_device(fs_info, NULL);
-	kobject_del(fs_info->device_dir_kobj);
-	kobject_put(fs_info->device_dir_kobj);
-	fs_info->device_dir_kobj = NULL;
+	if (fs_info->device_dir_kobj) {
+		btrfs_kobj_rm_device(fs_info, NULL);
+		kobject_del(fs_info->device_dir_kobj);
+		kobject_put(fs_info->device_dir_kobj);
+		fs_info->device_dir_kobj = NULL;
+	}
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
 	__btrfs_sysfs_remove_one(fs_info);
-- 
cgit v1.2.3-55-g7522


From aaf13305160490531b0d5ee4d56d32fc09f9bfa0 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:24 +0800
Subject: Btrfs: sysfs: reorder the kobject creations

As of now the order in which the kobjects are created
at btrfs_sysfs_add_one() is..
 fsid
 features
 unknown features (dynamic features)
 devices.

Since we would move fsid and device kobject to fs_devices
from fs_info structure, this patch will reorder in which
the kobjects are created as below.
 fsid
 devices
 features
 unknown features (dynamic features)

And hence the btrfs_sysfs_remove_one() will follow the same
in reverse order. and the device kobject destroy now can
be moved into the function __btrfs_sysfs_remove_one()

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 6218d31a6912..8eb2463c973c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -510,6 +510,13 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 
 static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 {
+	if (fs_info->device_dir_kobj) {
+		btrfs_kobj_rm_device(fs_info, NULL);
+		kobject_del(fs_info->device_dir_kobj);
+		kobject_put(fs_info->device_dir_kobj);
+		fs_info->device_dir_kobj = NULL;
+	}
+
 	kobject_del(&fs_info->super_kobj);
 	kobject_put(&fs_info->super_kobj);
 	wait_for_completion(&fs_info->kobj_unregister);
@@ -522,12 +529,6 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 		kobject_del(fs_info->space_info_kobj);
 		kobject_put(fs_info->space_info_kobj);
 	}
-	if (fs_info->device_dir_kobj) {
-		btrfs_kobj_rm_device(fs_info, NULL);
-		kobject_del(fs_info->device_dir_kobj);
-		kobject_put(fs_info->device_dir_kobj);
-		fs_info->device_dir_kobj = NULL;
-	}
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
 	__btrfs_sysfs_remove_one(fs_info);
@@ -700,6 +701,12 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 	if (error)
 		return error;
 
+	error = btrfs_kobj_add_device(fs_info, NULL);
+	if (error) {
+		__btrfs_sysfs_remove_one(fs_info);
+		return error;
+	}
+
 	error = sysfs_create_group(&fs_info->super_kobj,
 				   &btrfs_feature_attr_group);
 	if (error) {
@@ -711,10 +718,6 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 	if (error)
 		goto failure;
 
-	error = btrfs_kobj_add_device(fs_info, NULL);
-	if (error)
-		goto failure;
-
 	fs_info->space_info_kobj = kobject_create_and_add("allocation",
 						  &fs_info->super_kobj);
 	if (!fs_info->space_info_kobj) {
-- 
cgit v1.2.3-55-g7522


From 3a08f3b72a4f03e413d89d8505b17c05194e63f9 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:25 +0800
Subject: Btrfs: sysfs: rename __btrfs_sysfs_remove_one to
 btrfs_sysfs_remove_fsid

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8eb2463c973c..8b6eff48268a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -508,7 +508,7 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 	return 0;
 }
 
-static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+static void btrfs_sysfs_remove_fsid(struct btrfs_fs_info *fs_info)
 {
 	if (fs_info->device_dir_kobj) {
 		btrfs_kobj_rm_device(fs_info, NULL);
@@ -531,7 +531,7 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 	}
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
-	__btrfs_sysfs_remove_one(fs_info);
+	btrfs_sysfs_remove_fsid(fs_info);
 }
 
 const char * const btrfs_feature_set_names[3] = {
@@ -703,14 +703,14 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 
 	error = btrfs_kobj_add_device(fs_info, NULL);
 	if (error) {
-		__btrfs_sysfs_remove_one(fs_info);
+		btrfs_sysfs_remove_fsid(fs_info);
 		return error;
 	}
 
 	error = sysfs_create_group(&fs_info->super_kobj,
 				   &btrfs_feature_attr_group);
 	if (error) {
-		__btrfs_sysfs_remove_one(fs_info);
+		btrfs_sysfs_remove_fsid(fs_info);
 		return error;
 	}
 
-- 
cgit v1.2.3-55-g7522


From 720592157eeef627ff9a7c7c55ab1713bc48fb50 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:26 +0800
Subject: Btrfs: sysfs: introduce function btrfs_sysfs_add_fsid() to create
 sysfs fsid

We need it in a seperate function so that it can be called from the
device discovery thread as well.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8b6eff48268a..83c0c5d5368b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -690,7 +690,12 @@ static struct dentry *btrfs_debugfs_root_dentry;
 /* Debugging tunables and exported data */
 u64 btrfs_debugfs_test;
 
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+/*
+ * Can be called by the device discovery thread.
+ * And parent can be specified for seed device
+ */
+int btrfs_sysfs_add_fsid(struct btrfs_fs_info *fs_info,
+				struct kobject *parent)
 {
 	int error;
 
@@ -698,6 +703,14 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 	fs_info->super_kobj.kset = btrfs_kset;
 	error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
 				     "%pU", fs_info->fsid);
+	return error;
+}
+
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+{
+	int error;
+
+	error = btrfs_sysfs_add_fsid(fs_info, NULL);
 	if (error)
 		return error;
 
-- 
cgit v1.2.3-55-g7522


From 0dd2906f7229186424cdc80be8654b2c21d9c04c Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:27 +0800
Subject: Btrfs: sysfs: let default_attrs be separate from the kset

As of now btrfs_attrs are provided using the default_attrs through
the kset. Separate them and create the default_attrs using the
sysfs_create_files instead. By doing this we will have the
flexibility that device discovery thread could create fsid
kobject.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 83c0c5d5368b..11fa8e6c533c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -428,7 +428,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
 
 BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
 
-static struct attribute *btrfs_attrs[] = {
+static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(label),
 	BTRFS_ATTR_PTR(nodesize),
 	BTRFS_ATTR_PTR(sectorsize),
@@ -447,7 +447,6 @@ static void btrfs_release_super_kobj(struct kobject *kobj)
 static struct kobj_type btrfs_ktype = {
 	.sysfs_ops	= &kobj_sysfs_ops,
 	.release	= btrfs_release_super_kobj,
-	.default_attrs	= btrfs_attrs,
 };
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -531,6 +530,7 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 	}
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
+	sysfs_remove_files(&fs_info->super_kobj, btrfs_attrs);
 	btrfs_sysfs_remove_fsid(fs_info);
 }
 
@@ -720,13 +720,17 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 		return error;
 	}
 
-	error = sysfs_create_group(&fs_info->super_kobj,
-				   &btrfs_feature_attr_group);
+	error = sysfs_create_files(&fs_info->super_kobj, btrfs_attrs);
 	if (error) {
 		btrfs_sysfs_remove_fsid(fs_info);
 		return error;
 	}
 
+	error = sysfs_create_group(&fs_info->super_kobj,
+				   &btrfs_feature_attr_group);
+	if (error)
+		goto failure;
+
 	error = addrm_unknown_feature_attrs(fs_info, true);
 	if (error)
 		goto failure;
-- 
cgit v1.2.3-55-g7522


From 00c921c23f9155597ff4ac1263a6ff46dc9e2206 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:28 +0800
Subject: Btrfs: sysfs: separate device kobject and its attribute creation

Separate device kobject and its attribute creation so that device
kobject can be created from the device discovery thread.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 11fa8e6c533c..e71da324104a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -645,13 +645,8 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
-		struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_fs_info *fs_info)
 {
-	int error = 0;
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-	struct btrfs_device *dev;
-
 	if (!fs_info->device_dir_kobj)
 		fs_info->device_dir_kobj = kobject_create_and_add("devices",
 						&fs_info->super_kobj);
@@ -659,6 +654,20 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
 	if (!fs_info->device_dir_kobj)
 		return -ENOMEM;
 
+	return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+		struct btrfs_device *one_device)
+{
+	int error = 0;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *dev;
+
+	error = btrfs_sysfs_add_device(fs_info);
+	if (error)
+		return error;
+
 	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
 		struct hd_struct *disk;
 		struct kobject *disk_kobj;
-- 
cgit v1.2.3-55-g7522


From 2e7910d6ca359ff1dbe05b74e3d7f353b5b65362 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:29 +0800
Subject: Btrfs: sysfs: move super_kobj and device_dir_kobj from fs_info to
 btrfs_fs_devices

This patch will provide a framework and help to create attributes
from the structure btrfs_fs_devices which are available even before
fs_info is created. So by moving the parent kobject super_kobj from
fs_info to btrfs_fs_devices, it will help to create attributes
from the btrfs_fs_devices as well.

Patches on top of this patch now will be able to create the
sys/fs/btrfs/fsid kobject and attributes from btrfs_fs_devices
when devices are scanned and registered to the kernel.

Just to note, this does not change any of the existing btrfs sysfs
external kobject names and its attributes and not even the life
cycle of them. Changes are internal only. And to ensure the same,
this path has been tested with various device operations and,
checking and comparing the sysfs kobjects and attributes with
sysfs kobject and attributes with out this patch, and they remain
same.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/ctree.h   |  3 --
 fs/btrfs/sysfs.c   | 88 ++++++++++++++++++++++++++++++------------------------
 fs/btrfs/volumes.c |  3 +-
 fs/btrfs/volumes.h |  5 ++++
 4 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8d3d..3335245f2636 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1619,10 +1619,7 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
-	struct kobject super_kobj;
 	struct kobject *space_info_kobj;
-	struct kobject *device_dir_kobj;
-	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e71da324104a..f045c568b360 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -33,6 +33,7 @@
 #include "volumes.h"
 
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj);
 
 static u64 get_features(struct btrfs_fs_info *fs_info,
 			enum btrfs_feature_set set)
@@ -438,10 +439,10 @@ static const struct attribute *btrfs_attrs[] = {
 
 static void btrfs_release_super_kobj(struct kobject *kobj)
 {
-	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+	struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
 
-	memset(&fs_info->super_kobj, 0, sizeof(struct kobject));
-	complete(&fs_info->kobj_unregister);
+	memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+	complete(&fs_devs->kobj_unregister);
 }
 
 static struct kobj_type btrfs_ktype = {
@@ -449,11 +450,18 @@ static struct kobj_type btrfs_ktype = {
 	.release	= btrfs_release_super_kobj,
 };
 
+static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
+{
+	if (kobj->ktype != &btrfs_ktype)
+		return NULL;
+	return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+}
+
 static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 {
 	if (kobj->ktype != &btrfs_ktype)
 		return NULL;
-	return container_of(kobj, struct btrfs_fs_info, super_kobj);
+	return to_fs_devs(kobj)->fs_info;
 }
 
 #define NUM_FEATURE_BITS 64
@@ -494,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 			attrs[0] = &fa->kobj_attr.attr;
 			if (add) {
 				int ret;
-				ret = sysfs_merge_group(&fs_info->super_kobj,
+				ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
 							&agroup);
 				if (ret)
 					return ret;
 			} else
-				sysfs_unmerge_group(&fs_info->super_kobj,
+				sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
 						    &agroup);
 		}
 
@@ -507,18 +515,17 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 	return 0;
 }
 
-static void btrfs_sysfs_remove_fsid(struct btrfs_fs_info *fs_info)
+static void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 {
-	if (fs_info->device_dir_kobj) {
-		btrfs_kobj_rm_device(fs_info, NULL);
-		kobject_del(fs_info->device_dir_kobj);
-		kobject_put(fs_info->device_dir_kobj);
-		fs_info->device_dir_kobj = NULL;
+	if (fs_devs->device_dir_kobj) {
+		kobject_del(fs_devs->device_dir_kobj);
+		kobject_put(fs_devs->device_dir_kobj);
+		fs_devs->device_dir_kobj = NULL;
 	}
 
-	kobject_del(&fs_info->super_kobj);
-	kobject_put(&fs_info->super_kobj);
-	wait_for_completion(&fs_info->kobj_unregister);
+	kobject_del(&fs_devs->super_kobj);
+	kobject_put(&fs_devs->super_kobj);
+	wait_for_completion(&fs_devs->kobj_unregister);
 }
 
 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
@@ -529,9 +536,10 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 		kobject_put(fs_info->space_info_kobj);
 	}
 	addrm_unknown_feature_attrs(fs_info, false);
-	sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
-	sysfs_remove_files(&fs_info->super_kobj, btrfs_attrs);
-	btrfs_sysfs_remove_fsid(fs_info);
+	sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
+	sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
+	btrfs_kobj_rm_device(fs_info, NULL);
+	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 }
 
 const char * const btrfs_feature_set_names[3] = {
@@ -617,14 +625,14 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
 	struct hd_struct *disk;
 	struct kobject *disk_kobj;
 
-	if (!fs_info->device_dir_kobj)
+	if (!fs_info->fs_devices->device_dir_kobj)
 		return -EINVAL;
 
 	if (one_device && one_device->bdev) {
 		disk = one_device->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		sysfs_remove_link(fs_info->device_dir_kobj,
+		sysfs_remove_link(fs_info->fs_devices->device_dir_kobj,
 						disk_kobj->name);
 	}
 
@@ -638,20 +646,20 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
 		disk = one_device->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		sysfs_remove_link(fs_info->device_dir_kobj,
+		sysfs_remove_link(fs_info->fs_devices->device_dir_kobj,
 						disk_kobj->name);
 	}
 
 	return 0;
 }
 
-int btrfs_sysfs_add_device(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
 {
-	if (!fs_info->device_dir_kobj)
-		fs_info->device_dir_kobj = kobject_create_and_add("devices",
-						&fs_info->super_kobj);
+	if (!fs_devs->device_dir_kobj)
+		fs_devs->device_dir_kobj = kobject_create_and_add("devices",
+						&fs_devs->super_kobj);
 
-	if (!fs_info->device_dir_kobj)
+	if (!fs_devs->device_dir_kobj)
 		return -ENOMEM;
 
 	return 0;
@@ -664,7 +672,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *dev;
 
-	error = btrfs_sysfs_add_device(fs_info);
+	error = btrfs_sysfs_add_device(fs_devices);
 	if (error)
 		return error;
 
@@ -681,7 +689,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
 		disk = dev->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		error = sysfs_create_link(fs_info->device_dir_kobj,
+		error = sysfs_create_link(fs_devices->device_dir_kobj,
 					  disk_kobj, disk_kobj->name);
 		if (error)
 			break;
@@ -703,39 +711,41 @@ u64 btrfs_debugfs_test;
  * Can be called by the device discovery thread.
  * And parent can be specified for seed device
  */
-int btrfs_sysfs_add_fsid(struct btrfs_fs_info *fs_info,
+static int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 				struct kobject *parent)
 {
 	int error;
 
-	init_completion(&fs_info->kobj_unregister);
-	fs_info->super_kobj.kset = btrfs_kset;
-	error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
-				     "%pU", fs_info->fsid);
+	init_completion(&fs_devs->kobj_unregister);
+	fs_devs->super_kobj.kset = btrfs_kset;
+	error = kobject_init_and_add(&fs_devs->super_kobj, &btrfs_ktype, NULL,
+				     "%pU", fs_devs->fsid);
 	return error;
 }
 
 int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 {
 	int error;
+	struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
+	struct kobject *super_kobj = &fs_devs->super_kobj;
 
-	error = btrfs_sysfs_add_fsid(fs_info, NULL);
+	error = btrfs_sysfs_add_fsid(fs_devs, NULL);
 	if (error)
 		return error;
 
 	error = btrfs_kobj_add_device(fs_info, NULL);
 	if (error) {
-		btrfs_sysfs_remove_fsid(fs_info);
+		btrfs_sysfs_remove_fsid(fs_devs);
 		return error;
 	}
 
-	error = sysfs_create_files(&fs_info->super_kobj, btrfs_attrs);
+	error = sysfs_create_files(super_kobj, btrfs_attrs);
 	if (error) {
-		btrfs_sysfs_remove_fsid(fs_info);
+		btrfs_sysfs_remove_fsid(fs_devs);
 		return error;
 	}
 
-	error = sysfs_create_group(&fs_info->super_kobj,
+	error = sysfs_create_group(super_kobj,
 				   &btrfs_feature_attr_group);
 	if (error)
 		goto failure;
@@ -745,7 +755,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 		goto failure;
 
 	fs_info->space_info_kobj = kobject_create_and_add("allocation",
-						  &fs_info->super_kobj);
+						  super_kobj);
 	if (!fs_info->space_info_kobj) {
 		error = -ENOMEM;
 		goto failure;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 174f5e1e00ab..39ff99e4b5a6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2252,7 +2252,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		 */
 		snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
 						root->fs_info->fsid);
-		if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+		if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+								fsid_buf))
 			goto error_trans;
 	}
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ebc31331a837..e9780e9e6d97 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -253,6 +253,11 @@ struct btrfs_fs_devices {
 	 * nonrot flag set
 	 */
 	int rotating;
+
+	/* sysfs kobjects */
+	struct kobject super_kobj;
+	struct kobject *device_dir_kobj;
+	struct completion kobj_unregister;
 };
 
 #define BTRFS_BIO_INLINE_CSUM_SIZE	64
-- 
cgit v1.2.3-55-g7522


From c73eccf75bf92e49be30884da32a169b04eb5bc9 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:30 +0800
Subject: Btrfs: introduce btrfs_get_fs_uuids to get fs_uuids

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/volumes.c | 4 ++++
 fs/btrfs/volumes.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 39ff99e4b5a6..e500bfa54dc9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
+struct list_head *btrfs_get_fs_uuids(void)
+{
+	return &fs_uuids;
+}
 
 static struct btrfs_fs_devices *__alloc_fs_devices(void)
 {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index e9780e9e6d97..ac7e938c6977 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -542,5 +542,6 @@ static inline void unlock_chunks(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+struct list_head *btrfs_get_fs_uuids(void);
 
 #endif
-- 
cgit v1.2.3-55-g7522


From 5a13f4308c5b4af28c01ca9cacdd8a6db777dfcb Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:31 +0800
Subject: Btrfs: sysfs: add pointer to access fs_info from fs_devices

adds fs_info pointer with struct btrfs_fs_devices.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c   |  4 ++++
 fs/btrfs/volumes.c | 18 ++++++++++++++++++
 fs/btrfs/volumes.h |  3 +++
 3 files changed, 25 insertions(+)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f045c568b360..4b9a8df3faea 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -530,6 +530,8 @@ static void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 
 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 {
+	btrfs_reset_fs_info_ptr(fs_info);
+
 	if (fs_info->space_info_kobj) {
 		sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
 		kobject_del(fs_info->space_info_kobj);
@@ -729,6 +731,8 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 	struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
 	struct kobject *super_kobj = &fs_devs->super_kobj;
 
+	btrfs_set_fs_info_ptr(fs_info);
+
 	error = btrfs_sysfs_add_fsid(fs_devs, NULL);
 	if (error)
 		return error;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e500bfa54dc9..5719470b50cd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6733,3 +6733,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
 	}
 	unlock_chunks(root);
 }
+
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	while (fs_devices) {
+		fs_devices->fs_info = fs_info;
+		fs_devices = fs_devices->seed;
+	}
+}
+
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	while (fs_devices) {
+		fs_devices->fs_info = NULL;
+		fs_devices = fs_devices->seed;
+	}
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index ac7e938c6977..210a64390f40 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -254,6 +254,7 @@ struct btrfs_fs_devices {
 	 */
 	int rotating;
 
+	struct btrfs_fs_info *fs_info;
 	/* sysfs kobjects */
 	struct kobject super_kobj;
 	struct kobject *device_dir_kobj;
@@ -543,5 +544,7 @@ static inline void unlock_chunks(struct btrfs_root *root)
 }
 
 struct list_head *btrfs_get_fs_uuids(void);
+void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
 
 #endif
-- 
cgit v1.2.3-55-g7522


From 2e3e12815a296f263261b17b3a5781cbd517f7f3 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:32 +0800
Subject: Btrfs: sysfs: provide framework to remove all fsid sysfs kobject

Just a helper function to clean up the sysfs fsid kobjects.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4b9a8df3faea..333ed0840907 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -515,7 +515,7 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
 	return 0;
 }
 
-static void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 {
 	if (fs_devs->device_dir_kobj) {
 		kobject_del(fs_devs->device_dir_kobj);
@@ -528,6 +528,21 @@ static void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 	wait_for_completion(&fs_devs->kobj_unregister);
 }
 
+/* when fs_devs is NULL it will remove all fsid kobject */
+static void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+{
+	struct list_head *fs_uuids = btrfs_get_fs_uuids();
+
+	if (fs_devs) {
+		__btrfs_sysfs_remove_fsid(fs_devs);
+		return;
+	}
+
+	list_for_each_entry(fs_devs, fs_uuids, list) {
+		__btrfs_sysfs_remove_fsid(fs_devs);
+	}
+}
+
 void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 {
 	btrfs_reset_fs_info_ptr(fs_info);
-- 
cgit v1.2.3-55-g7522


From 1ba43816af921219d596c462baa7674ff0228229 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:33 +0800
Subject: Btrfs: sysfs btrfs_kobj_add_device() pass fs_devices instead of
 fs_info

btrfs_kobj_add_device() does not need fs_info any more.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/dev-replace.c | 2 +-
 fs/btrfs/sysfs.c       | 7 +++----
 fs/btrfs/sysfs.h       | 2 +-
 fs/btrfs/volumes.c     | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 0573848c7333..72c03adb6562 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -584,7 +584,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
 	/* replace the sysfs entry */
 	btrfs_kobj_rm_device(fs_info, src_device);
-	btrfs_kobj_add_device(fs_info, tgt_device);
+	btrfs_kobj_add_device(fs_info->fs_devices, tgt_device);
 	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
 	/* write back the superblocks */
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 333ed0840907..82e18f532958 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -682,11 +682,10 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
 	return 0;
 }
 
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
-		struct btrfs_device *one_device)
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+				struct btrfs_device *one_device)
 {
 	int error = 0;
-	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *dev;
 
 	error = btrfs_sysfs_add_device(fs_devices);
@@ -752,7 +751,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 	if (error)
 		return error;
 
-	error = btrfs_kobj_add_device(fs_info, NULL);
+	error = btrfs_kobj_add_device(fs_devs, NULL);
 	if (error) {
 		btrfs_sysfs_remove_fsid(fs_devs);
 		return error;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3a4bbed723fd..f1d7c7604f6f 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,7 +82,7 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
 int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
                 struct btrfs_device *one_device);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5719470b50cd..d68d3944af4b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2215,7 +2215,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 				    tmp + 1);
 
 	/* add sysfs device entry */
-	btrfs_kobj_add_device(root->fs_info, device);
+	btrfs_kobj_add_device(root->fs_info->fs_devices, device);
 
 	/*
 	 * we've got more storage, clear any full flags on the space
-- 
cgit v1.2.3-55-g7522


From 6c14a1641bfaa213bb3c5bcb7f4d8cde234ada70 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:34 +0800
Subject: Btrfs: sysfs btrfs_kobj_rm_device() pass fs_devices instead of
 fs_info

since btrfs_kobj_rm_device() does nothing with fs_info

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/dev-replace.c |  2 +-
 fs/btrfs/sysfs.c       | 12 ++++++------
 fs/btrfs/sysfs.h       |  2 +-
 fs/btrfs/volumes.c     |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 72c03adb6562..f982ef303f1c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -583,7 +583,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&uuid_mutex);
 
 	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info, src_device);
+	btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
 	btrfs_kobj_add_device(fs_info->fs_devices, tgt_device);
 	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 82e18f532958..0a5d1eebc27e 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -555,7 +555,7 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 	addrm_unknown_feature_attrs(fs_info, false);
 	sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
 	sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
-	btrfs_kobj_rm_device(fs_info, NULL);
+	btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
 	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 }
 
@@ -636,20 +636,20 @@ static void init_feature_attrs(void)
 
 /* when one_device is NULL, it removes all device links */
 
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device)
 {
 	struct hd_struct *disk;
 	struct kobject *disk_kobj;
 
-	if (!fs_info->fs_devices->device_dir_kobj)
+	if (!fs_devices->device_dir_kobj)
 		return -EINVAL;
 
 	if (one_device && one_device->bdev) {
 		disk = one_device->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		sysfs_remove_link(fs_info->fs_devices->device_dir_kobj,
+		sysfs_remove_link(fs_devices->device_dir_kobj,
 						disk_kobj->name);
 	}
 
@@ -657,13 +657,13 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
 		return 0;
 
 	list_for_each_entry(one_device,
-			&fs_info->fs_devices->devices, dev_list) {
+			&fs_devices->devices, dev_list) {
 		if (!one_device->bdev)
 			continue;
 		disk = one_device->bdev->bd_part;
 		disk_kobj = &part_to_dev(disk)->kobj;
 
-		sysfs_remove_link(fs_info->fs_devices->device_dir_kobj,
+		sysfs_remove_link(fs_devices->device_dir_kobj,
 						disk_kobj->name);
 	}
 
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f1d7c7604f6f..808d12ac4a5a 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -84,6 +84,6 @@ extern struct kobj_type space_info_ktype;
 extern struct kobj_type btrfs_raid_ktype;
 int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
 #endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d68d3944af4b..b5cc129bfa4b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1710,7 +1710,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		device->fs_devices->open_devices--;
 		/* remove sysfs entry */
-		btrfs_kobj_rm_device(root->fs_info, device);
+		btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
 	}
 
 	call_rcu(&device->rcu, free_device);
@@ -2294,7 +2294,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 error_trans:
 	btrfs_end_transaction(trans, root);
 	rcu_string_free(device->name);
-	btrfs_kobj_rm_device(root->fs_info, device);
+	btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
 	kfree(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
-- 
cgit v1.2.3-55-g7522


From 0c10e2d482ba7eafb9806f3ee071c8af5afcde55 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:35 +0800
Subject: Btrfs: sysfs: make btrfs_sysfs_add_fsid() non static

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 2 +-
 fs/btrfs/sysfs.h | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 0a5d1eebc27e..53a327ba75da 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -727,7 +727,7 @@ u64 btrfs_debugfs_test;
  * Can be called by the device discovery thread.
  * And parent can be specified for seed device
  */
-static int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 				struct kobject *parent)
 {
 	int error;
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 808d12ac4a5a..4edf2a7839ad 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -86,4 +86,6 @@ int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
 		struct btrfs_device *one_device);
 int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
+int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
+				struct kobject *parent);
 #endif /* _BTRFS_SYSFS_H_ */
-- 
cgit v1.2.3-55-g7522


From ef1a0daadf2b722fc1bb6c2d5cf39239591e78ec Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:36 +0800
Subject: Btrfs: sysfs: make btrfs_sysfs_add_device() non static

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 4edf2a7839ad..915307ff1a76 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -88,4 +88,5 @@ int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
                 struct btrfs_device *one_device);
 int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 				struct kobject *parent);
+int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
 #endif /* _BTRFS_SYSFS_H_ */
-- 
cgit v1.2.3-55-g7522


From 1d1c1be3720c1f4b621c5f90a4f61bdd856a0aa8 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:37 +0800
Subject: Btrfs: sysfs: btrfs_sysfs_remove_fsid() make it non static

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 2 +-
 fs/btrfs/sysfs.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 53a327ba75da..06eae456612f 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -529,7 +529,7 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 }
 
 /* when fs_devs is NULL it will remove all fsid kobject */
-static void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 {
 	struct list_head *fs_uuids = btrfs_get_fs_uuids();
 
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 915307ff1a76..6392527bcc15 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -89,4 +89,5 @@ int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
 int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 				struct kobject *parent);
 int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
 #endif /* _BTRFS_SYSFS_H_ */
-- 
cgit v1.2.3-55-g7522


From b7c35e81adcd593daca2160b5ba0ec62f71a9303 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:38 +0800
Subject: Btrfs: sysfs: separate kobject and attribute creation

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/disk-io.c | 18 +++++++++++++++++-
 fs/btrfs/sysfs.c   | 15 ++-------------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c42503cf3462..d29a2515f1e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2873,10 +2873,22 @@ retry_root_backup:
 
 	btrfs_close_extra_devices(fs_devices, 1);
 
+	ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
+	if (ret) {
+		pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret);
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_sysfs_add_device(fs_devices);
+	if (ret) {
+		pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret);
+		goto fail_fsdev_sysfs;
+	}
+
 	ret = btrfs_sysfs_add_one(fs_info);
 	if (ret) {
 		pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
-		goto fail_block_groups;
+		goto fail_fsdev_sysfs;
 	}
 
 	ret = btrfs_init_space_info(fs_info);
@@ -3054,6 +3066,9 @@ fail_cleaner:
 fail_sysfs:
 	btrfs_sysfs_remove_one(fs_info);
 
+fail_fsdev_sysfs:
+	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
+
 fail_block_groups:
 	btrfs_put_block_group_cache(fs_info);
 	btrfs_free_block_groups(fs_info);
@@ -3731,6 +3746,7 @@ void close_ctree(struct btrfs_root *root)
 	}
 
 	btrfs_sysfs_remove_one(fs_info);
+	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 
 	btrfs_free_fs_roots(fs_info);
 
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 06eae456612f..b35366c88d3d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -556,7 +556,6 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
 	sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
 	sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
 	btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
-	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
 }
 
 const char * const btrfs_feature_set_names[3] = {
@@ -688,10 +687,6 @@ int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
 	int error = 0;
 	struct btrfs_device *dev;
 
-	error = btrfs_sysfs_add_device(fs_devices);
-	if (error)
-		return error;
-
 	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
 		struct hd_struct *disk;
 		struct kobject *disk_kobj;
@@ -747,19 +742,13 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
 
 	btrfs_set_fs_info_ptr(fs_info);
 
-	error = btrfs_sysfs_add_fsid(fs_devs, NULL);
-	if (error)
-		return error;
-
 	error = btrfs_kobj_add_device(fs_devs, NULL);
-	if (error) {
-		btrfs_sysfs_remove_fsid(fs_devs);
+	if (error)
 		return error;
-	}
 
 	error = sysfs_create_files(super_kobj, btrfs_attrs);
 	if (error) {
-		btrfs_sysfs_remove_fsid(fs_devs);
+		btrfs_kobj_rm_device(fs_devs, NULL);
 		return error;
 	}
 
-- 
cgit v1.2.3-55-g7522


From 24bd69cb0fb2d90f4ced5f6acfee3688b33bcbdd Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:39 +0800
Subject: Btrfs: sysfs: add support to add parent for fsid

To support seed sysfs layout and represent seed fsid under
the sprout we need the facility to create fsid under the
specified parent.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index b35366c88d3d..ea81a057c79b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -729,8 +729,8 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 
 	init_completion(&fs_devs->kobj_unregister);
 	fs_devs->super_kobj.kset = btrfs_kset;
-	error = kobject_init_and_add(&fs_devs->super_kobj, &btrfs_ktype, NULL,
-				     "%pU", fs_devs->fsid);
+	error = kobject_init_and_add(&fs_devs->super_kobj,
+				&btrfs_ktype, parent, "%pU", fs_devs->fsid);
 	return error;
 }
 
-- 
cgit v1.2.3-55-g7522


From 2421a8cd5faaa9c2c9397123c5a297bab227d965 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:40 +0800
Subject: Btrfs: sysfs: don't fail seeding for the sake of sysfs kobject issue

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b5cc129bfa4b..b851964b7345 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2258,7 +2258,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 						root->fs_info->fsid);
 		if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
 								fsid_buf))
-			goto error_trans;
+			pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
 	}
 
 	root->fs_info->num_tolerated_disk_barrier_failures =
-- 
cgit v1.2.3-55-g7522


From 80aa6027561eef12b49031d46fd6724daf1e7fb6 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 27 Mar 2015 17:50:45 +0000
Subject: Btrfs: incremental send, don't delay directory renames unnecessarily

Even though we delay the rename of directories when they become
descendents of other directories that were also renamed in the send
root to prevent infinite path build loops, we were doing it in cases
where this was not needed and was actually harmful resulting in
infinite path build loops as we ended up with a circular dependency
of delayed directory renames.

Consider the following reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt2

  $ mkdir /mnt/data
  $ mkdir /mnt/data/n1
  $ mkdir /mnt/data/n1/n2
  $ mkdir /mnt/data/n4
  $ mkdir /mnt/data/n1/n2/p1
  $ mkdir /mnt/data/n1/n2/p1/p2
  $ mkdir /mnt/data/t6
  $ mkdir /mnt/data/t7
  $ mkdir -p /mnt/data/t5/t7
  $ mkdir /mnt/data/t2
  $ mkdir /mnt/data/t4
  $ mkdir -p /mnt/data/t1/t3
  $ mkdir /mnt/data/p1
  $ mv /mnt/data/t1 /mnt/data/p1
  $ mkdir -p /mnt/data/p1/p2
  $ mv /mnt/data/t4 /mnt/data/p1/p2/t1
  $ mv /mnt/data/t5 /mnt/data/n4/t5
  $ mv /mnt/data/n1/n2/p1/p2 /mnt/data/n4/t5/p2
  $ mv /mnt/data/t7 /mnt/data/n4/t5/p2/t7
  $ mv /mnt/data/t2 /mnt/data/n4/t1
  $ mv /mnt/data/p1 /mnt/data/n4/t5/p2/p1
  $ mv /mnt/data/n1/n2 /mnt/data/n4/t5/p2/p1/p2/n2
  $ mv /mnt/data/n4/t5/p2/p1/p2/t1 /mnt/data/n4/t5/p2/p1/p2/n2/t1
  $ mv /mnt/data/n4/t5/t7 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t7
  $ mv /mnt/data/n4/t5/p2/p1/t1/t3 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t3
  $ mv /mnt/data/n4/t5/p2/p1/p2/n2/p1 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t7/p1
  $ mv /mnt/data/t6 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t3/t5
  $ mv /mnt/data/n4/t5/p2/p1/t1 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t3/t1
  $ mv /mnt/data/n1 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t7/p1/n1

  $ btrfs subvolume snapshot -r /mnt /mnt/snap1

  $ mv /mnt/data/n4/t1 /mnt/data/n4/t5/p2/p1/p2/n2/t1/t7/p1/t1
  $ mv /mnt/data/n4/t5/p2/p1/p2/n2/t1 /mnt/data/n4/
  $ mv /mnt/data/n4/t5/p2/p1/p2/n2 /mnt/data/n4/t1/n2
  $ mv /mnt/data/n4/t1/t7/p1 /mnt/data/n4/t1/n2/p1
  $ mv /mnt/data/n4/t1/t3/t1 /mnt/data/n4/t1/n2/t1
  $ mv /mnt/data/n4/t1/t3 /mnt/data/n4/t1/n2/t1/t3
  $ mv /mnt/data/n4/t5/p2/p1/p2 /mnt/data/n4/t1/n2/p1/p2
  $ mv /mnt/data/n4/t1/t7 /mnt/data/n4/t1/n2/p1/t7
  $ mv /mnt/data/n4/t5/p2/p1 /mnt/data/n4/t1/n2/p1/p2/p1
  $ mv /mnt/data/n4/t1/n2/t1/t3/t5 /mnt/data/n4/t1/n2/p1/p2/t5
  $ mv /mnt/data/n4/t5 /mnt/data/n4/t1/n2/p1/p2/p1/t5
  $ mv /mnt/data/n4/t1/n2/p1/p2/p1/t5/p2 /mnt/data/n4/t1/n2/p1/p2/p1/p2
  $ mv /mnt/data/n4/t1/n2/p1/p2/p1/p2/t7 /mnt/data/n4/t1/t7

  $ btrfs subvolume snapshot -r /mnt /mnt/snap2

  $ btrfs send /mnt/snap1 | btrfs receive /mnt2
  $ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive -vv /mnt2
  ERROR: send ioctl failed with -12: Cannot allocate memory

This reproducer resulted in an infinite path build loop when building the
path for inode 266 because the following circular dependency of delayed
directory renames was created:

   ino 272 <- ino 261 <- ino 259 <- ino 268 <- ino 267 <- ino 261

Where the notation "X <- Y" means the rename of inode X is delayed by the
rename of inode Y (X will be renamed after Y is renamed). This resulted
in an infinite path build loop of inode 266 because that inode has inode
261 as an ancestor in the send root and inode 261 is in the circular
dependency of delayed renames listed above.

Fix this by not delaying the rename of a directory inode if an ancestor of
the inode in the send root, which has a delayed rename operation, is not
also a descendent of the inode in the parent root.

Thanks to Robbie Ko for sending the reproducer example.
A test case for xfstests follows soon.

Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a1216f9b4917..2ed36af6e43a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3353,6 +3353,37 @@ out:
 	return ret;
 }
 
+/*
+ * Check if ino ino1 is an ancestor of inode ino2 in the given root.
+ * Return 1 if true, 0 if false and < 0 on error.
+ */
+static int is_ancestor(struct btrfs_root *root,
+		       const u64 ino1,
+		       const u64 ino1_gen,
+		       const u64 ino2,
+		       struct fs_path *fs_path)
+{
+	u64 ino = ino2;
+
+	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+		int ret;
+		u64 parent;
+		u64 parent_gen;
+
+		fs_path_reset(fs_path);
+		ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
+		if (ret < 0) {
+			if (ret == -ENOENT && ino == ino2)
+				ret = 0;
+			return ret;
+		}
+		if (parent == ino1)
+			return parent_gen == ino1_gen ? 1 : 0;
+		ino = parent;
+	}
+	return 0;
+}
+
 static int wait_for_parent_move(struct send_ctx *sctx,
 				struct recorded_ref *parent_ref)
 {
@@ -3374,11 +3405,24 @@ static int wait_for_parent_move(struct send_ctx *sctx,
 	 * Our current directory inode may not yet be renamed/moved because some
 	 * ancestor (immediate or not) has to be renamed/moved first. So find if
 	 * such ancestor exists and make sure our own rename/move happens after
-	 * that ancestor is processed.
+	 * that ancestor is processed to avoid path build infinite loops (done
+	 * at get_cur_path()).
 	 */
 	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
 		if (is_waiting_for_move(sctx, ino)) {
-			ret = 1;
+			/*
+			 * If the current inode is an ancestor of ino in the
+			 * parent root, we need to delay the rename of the
+			 * current inode, otherwise don't delayed the rename
+			 * because we can end up with a circular dependency
+			 * of renames, resulting in some directories never
+			 * getting the respective rename operations issued in
+			 * the send stream or getting into infinite path build
+			 * loops.
+			 */
+			ret = is_ancestor(sctx->parent_root,
+					  sctx->cur_ino, sctx->cur_inode_gen,
+					  ino, path_before);
 			break;
 		}
 
-- 
cgit v1.2.3-55-g7522


From 8b191a684968e24b34c9894024b37532c68e6ae8 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 9 Apr 2015 14:09:14 +0100
Subject: Btrfs: incremental send, check if orphanized dir inode needs delayed
 rename

If a directory inode is orphanized, because some inode previously
processed has a new name that collides with the old name of the current
inode, we need to check if it needs its rename operation delayed too,
as its ancestor-descendent relationship with some other inode might
have been reversed between the parent and send snapshots and therefore
its rename operation needs to happen after that other inode is renamed.

For example, for the following reproducer where this is needed (provided
by Robbie Ko):

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt2

  $ mkdir -p /mnt/data/n1/n2
  $ mkdir /mnt/data/n4
  $ mkdir -p /mnt/data/t6/t7
  $ mkdir /mnt/data/t5
  $ mkdir /mnt/data/t7
  $ mkdir /mnt/data/n4/t2
  $ mkdir /mnt/data/t4
  $ mkdir /mnt/data/t3
  $ mv /mnt/data/t7 /mnt/data/n4/t2
  $ mv /mnt/data/t4 /mnt/data/n4/t2/t7
  $ mv /mnt/data/t5 /mnt/data/n4/t2/t7/t4
  $ mv /mnt/data/t6 /mnt/data/n4/t2/t7/t4/t5
  $ mv /mnt/data/n1/n2 /mnt/data/n4/t2/t7/t4/t5/t6
  $ mv /mnt/data/n1 /mnt/data/n4/t2/t7/t4/t5/t6
  $ mv /mnt/data/n4/t2/t7/t4/t5/t6/t7 /mnt/data/n4/t2/t7/t4/t5/t6/n2
  $ mv /mnt/data/t3 /mnt/data/n4/t2/t7/t4/t5/t6/n2/t7

  $ btrfs subvolume snapshot -r /mnt /mnt/snap1

  $ mv /mnt/data/n4/t2/t7/t4/t5/t6/n1 /mnt/data/n4
  $ mv /mnt/data/n4/t2 /mnt/data/n4/n1
  $ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6/n2 /mnt/data/n4/n1/t2
  $ mv /mnt/data/n4/n1/t2/n2/t7/t3 /mnt/data/n4/n1/t2
  $ mv /mnt/data/n4/n1/t2/t7/t4/t5/t6 /mnt/data/n4/n1/t2
  $ mv /mnt/data/n4/n1/t2/t7/t4 /mnt/data/n4/n1/t2/t6
  $ mv /mnt/data/n4/n1/t2/t7 /mnt/data/n4/n1/t2/t3
  $ mv /mnt/data/n4/n1/t2/n2/t7 /mnt/data/n4/n1/t2

  $ btrfs subvolume snapshot -r /mnt /mnt/snap2

  $ btrfs send /mnt/snap1 | btrfs receive /mnt2
  $ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
  ERROR: send ioctl failed with -12: Cannot allocate memory

Where the parent snapshot directory hierarchy is the following:

  .                                                        (ino 256)
  |-- data/                                                (ino 257)
        |-- n4/                                            (ino 260)
             |-- t2/                                       (ino 265)
                  |-- t7/                                  (ino 264)
                       |-- t4/                             (ino 266)
                            |-- t5/                        (ino 263)
                                 |-- t6/                   (ino 261)
                                      |-- n1/              (ino 258)
                                      |-- n2/              (ino 259)
                                           |-- t7/         (ino 262)
                                                |-- t3/    (ino 267)

And the send snapshot's directory hierarchy is the following:

  .                                                        (ino 256)
  |-- data/                                                (ino 257)
        |-- n4/                                            (ino 260)
             |-- n1/                                       (ino 258)
                  |-- t2/                                  (ino 265)
                       |-- n2/                             (ino 259)
                       |-- t3/                             (ino 267)
                       |    |-- t7                         (ino 264)
                       |
                       |-- t6/                             (ino 261)
                       |    |-- t4/                        (ino 266)
                       |         |-- t5/                   (ino 263)
                       |
                       |-- t7/                             (ino 262)

While processing inode 262 we orphanize inode 264 and later attempt
to rename inode 264 to its new name/location, which resulted in building
an incorrect destination path string for the rename operation with the
value "data/n4/t2/t7/t4/t5/t6/n2/t7/t3/t7". This rename operation must
have been done only after inode 267 is processed and renamed, as the
ancestor-descendent relationship between inodes 264 and 267 was reversed
between both snapshots, because otherwise it results in an infinite loop
when building the path string for inode 264 when we are processing an
inode with a number larger than 264. That loop is the following:

  start inode 264, send progress of 265 for example
  parent of 264 -> 267
  parent of 267 -> 262
  parent of 262 -> 259
  parent of 259 -> 261
  parent of 261 -> 263
  parent of 263 -> 266
  parent of 266 -> 264
    |--> back to first iteration while current path string length
         is <= PATH_MAX, and fail with -ENOMEM otherwise

So fix this by making the check if we need to delay a directory rename
regardless of the current inode having been orphanized or not.

A test case for fstests follows soon.

Thanks to Robbie Ko for providing a reproducer for this problem.

Reported-by: Robbie Ko <robbieko@synology.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
---
 fs/btrfs/send.c | 56 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 37 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 2ed36af6e43a..895f1b180b2f 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -243,6 +243,7 @@ struct waiting_dir_move {
 	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
 	 */
 	u64 rmdir_ino;
+	bool orphanized;
 };
 
 struct orphan_dir_info {
@@ -1900,8 +1901,13 @@ static int did_overwrite_ref(struct send_ctx *sctx,
 		goto out;
 	}
 
-	/* we know that it is or will be overwritten. check this now */
-	if (ow_inode < sctx->send_progress)
+	/*
+	 * We know that it is or will be overwritten. Check this now.
+	 * The current inode being processed might have been the one that caused
+	 * inode 'ino' to be orphanized, therefore ow_inode can actually be the
+	 * same as sctx->send_progress.
+	 */
+	if (ow_inode <= sctx->send_progress)
 		ret = 1;
 	else
 		ret = 0;
@@ -2223,6 +2229,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 	fs_path_reset(dest);
 
 	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
+		struct waiting_dir_move *wdm;
+
 		fs_path_reset(name);
 
 		if (is_waiting_for_rm(sctx, ino)) {
@@ -2233,7 +2241,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 			break;
 		}
 
-		if (is_waiting_for_move(sctx, ino)) {
+		wdm = get_waiting_dir_move(sctx, ino);
+		if (wdm && wdm->orphanized) {
+			ret = gen_unique_name(sctx, ino, gen, name);
+			stop = 1;
+		} else if (wdm) {
 			ret = get_first_ref(sctx->parent_root, ino,
 					    &parent_inode, &parent_gen, name);
 		} else {
@@ -2923,7 +2935,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
 	return entry != NULL;
 }
 
-static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
 {
 	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
 	struct rb_node *parent = NULL;
@@ -2934,6 +2946,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
 		return -ENOMEM;
 	dm->ino = ino;
 	dm->rmdir_ino = 0;
+	dm->orphanized = orphanized;
 
 	while (*p) {
 		parent = *p;
@@ -3030,7 +3043,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
 			goto out;
 	}
 
-	ret = add_waiting_dir_move(sctx, pm->ino);
+	ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
 	if (ret)
 		goto out;
 
@@ -3385,7 +3398,8 @@ static int is_ancestor(struct btrfs_root *root,
 }
 
 static int wait_for_parent_move(struct send_ctx *sctx,
-				struct recorded_ref *parent_ref)
+				struct recorded_ref *parent_ref,
+				const bool is_orphan)
 {
 	int ret = 0;
 	u64 ino = parent_ref->dir;
@@ -3464,7 +3478,7 @@ out:
 					   ino,
 					   &sctx->new_refs,
 					   &sctx->deleted_refs,
-					   false);
+					   is_orphan);
 		if (!ret)
 			ret = 1;
 	}
@@ -3633,6 +3647,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 			}
 		}
 
+		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
+		    can_rename) {
+			ret = wait_for_parent_move(sctx, cur, is_orphan);
+			if (ret < 0)
+				goto out;
+			if (ret == 1) {
+				can_rename = false;
+				*pending_move = 1;
+			}
+		}
+
 		/*
 		 * link/move the ref to the new place. If we have an orphan
 		 * inode, move it and update valid_path. If not, link or move
@@ -3653,18 +3678,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 				 * dirs, we always have one new and one deleted
 				 * ref. The deleted ref is ignored later.
 				 */
-				ret = wait_for_parent_move(sctx, cur);
-				if (ret < 0)
-					goto out;
-				if (ret) {
-					*pending_move = 1;
-				} else {
-					ret = send_rename(sctx, valid_path,
-							  cur->full_path);
-					if (!ret)
-						ret = fs_path_copy(valid_path,
-							       cur->full_path);
-				}
+				ret = send_rename(sctx, valid_path,
+						  cur->full_path);
+				if (!ret)
+					ret = fs_path_copy(valid_path,
+							   cur->full_path);
 				if (ret < 0)
 					goto out;
 			} else {
-- 
cgit v1.2.3-55-g7522


From 64ad6c488975d7516230cf7849190a991fd615ae Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Tue, 2 Jun 2015 17:31:00 -0700
Subject: Btrfs: don't invalidate root dentry when subvolume deletion fails

Since commit bafc9b754f75 ("vfs: More precise tests in d_invalidate"),
mounted subvolumes can be deleted because d_invalidate() won't fail.
However, we run into problems when we attempt to delete the default
subvolume while it is mounted as the root filesystem:

	# btrfs subvol list /
	ID 257 gen 306 top level 5 path rootvol
	ID 267 gen 334 top level 5 path snap1
	# btrfs subvol get-default /
	ID 267 gen 334 top level 5 path snap1
	# btrfs inspect-internal rootid /
	267
	# mount -o subvol=/ /dev/vda1 /mnt
	# btrfs subvol del /mnt/snap1
	Delete subvolume (no-commit): '/mnt/snap1'
	ERROR: cannot delete '/mnt/snap1' - Operation not permitted
	# findmnt /
	findmnt: can't read /proc/mounts: No such file or directory
	# ls /proc
	#

Markus reported that this same scenario simply led to a kernel oops.

This happens because in btrfs_ioctl_snap_destroy(), we call
d_invalidate() before we check may_destroy_subvol(), which means that we
detach the submounts and drop the dentry before erroring out. Instead,
we should only invalidate the dentry once the deletion has succeeded.
Additionally, the shrink_dcache_sb() isn't necessary; d_invalidate()
will prune the dcache for the deleted subvolume.

Cc: <stable@vger.kernel.org>
Fixes: bafc9b754f75 ("vfs: More precise tests in d_invalidate")
Reported-by: Markus Schauler <mschauler@gmail.com>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 1c22c6518504..6f790bcddfc1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2413,8 +2413,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto out_unlock_inode;
 	}
 
-	d_invalidate(dentry);
-
 	down_write(&root->fs_info->subvol_sem);
 
 	err = may_destroy_subvol(dest);
@@ -2508,7 +2506,7 @@ out_up_write:
 out_unlock_inode:
 	mutex_unlock(&inode->i_mutex);
 	if (!err) {
-		shrink_dcache_sb(root->fs_info->sb);
+		d_invalidate(dentry);
 		btrfs_invalidate_inodes(dest);
 		d_delete(dentry);
 		ASSERT(dest->send_in_progress == 0);
-- 
cgit v1.2.3-55-g7522


From 53e489bc8cd9dcfe95be3e422121539250aa8221 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 2 Jun 2015 14:43:21 +0100
Subject: Btrfs: check pending chunks when shrinking fs to avoid corruption

When we shrink the usable size of a device (its total_bytes), we go over
all the device extent items in the device tree and attempt to relocate
the chunk of any device extent that goes beyond the new usable size for
the device. We do that after setting the new usable size (total_bytes) in
the device object, so that all new allocations (and reallocations) don't
use areas of the device that go beyond the new (shorter) size. However we
were not considering that before setting the new size in the device,
pending chunks might have been created that use device extents that go
beyond the new size, and those device extents are not yet in the device
tree after we search the device tree - they are still attached to the
list of new block group for some ongoing transaction handle, and they are
only added to the device tree when the transaction handle is ended (via
btrfs_create_pending_block_groups()).

So check for pending chunks with device extents that go beyond the new
size and if any exists, commit the current transaction and repeat the
search in the device tree.

Not doing this it would mean we would return success to user space while
still having extents that go beyond the new size, and later user space
could override those locations on the device while the fs still references
them, causing all sorts of corruption and unexpected events.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 49 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 40 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 174f5e1e00ab..d0582b785485 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3965,6 +3965,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	int slot;
 	int failed = 0;
 	bool retried = false;
+	bool checked_pending_chunks = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
@@ -4045,15 +4046,6 @@ again:
 		goto again;
 	} else if (failed && retried) {
 		ret = -ENOSPC;
-		lock_chunks(root);
-
-		btrfs_device_set_total_bytes(device, old_size);
-		if (device->writeable)
-			device->fs_devices->total_rw_bytes += diff;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space += diff;
-		spin_unlock(&root->fs_info->free_chunk_lock);
-		unlock_chunks(root);
 		goto done;
 	}
 
@@ -4065,6 +4057,35 @@ again:
 	}
 
 	lock_chunks(root);
+
+	/*
+	 * We checked in the above loop all device extents that were already in
+	 * the device tree. However before we have updated the device's
+	 * total_bytes to the new size, we might have had chunk allocations that
+	 * have not complete yet (new block groups attached to transaction
+	 * handles), and therefore their device extents were not yet in the
+	 * device tree and we missed them in the loop above. So if we have any
+	 * pending chunk using a device extent that overlaps the device range
+	 * that we can not use anymore, commit the current transaction and
+	 * repeat the search on the device tree - this way we guarantee we will
+	 * not have chunks using device extents that end beyond 'new_size'.
+	 */
+	if (!checked_pending_chunks) {
+		u64 start = new_size;
+		u64 len = old_size - new_size;
+
+		if (contains_pending_extent(trans, device, &start, len)) {
+			unlock_chunks(root);
+			checked_pending_chunks = true;
+			failed = 0;
+			retried = false;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				goto done;
+			goto again;
+		}
+	}
+
 	btrfs_device_set_disk_total_bytes(device, new_size);
 	if (list_empty(&device->resized_list))
 		list_add_tail(&device->resized_list,
@@ -4079,6 +4100,16 @@ again:
 	btrfs_end_transaction(trans, root);
 done:
 	btrfs_free_path(path);
+	if (ret) {
+		lock_chunks(root);
+		btrfs_device_set_total_bytes(device, old_size);
+		if (device->writeable)
+			device->fs_devices->total_rw_bytes += diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+		unlock_chunks(root);
+	}
 	return ret;
 }
 
-- 
cgit v1.2.3-55-g7522


From 13028901a4a62096e97a0fc44388ea859587f690 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Apr 2015 16:44:30 +0200
Subject: btrfs: let tree defrag work in SSD mode

Long time ago (2008) the defrag was automatic for new b-tree writes but
has been disabled after performance problems. There was a leftover in
tree-defrag.c that effectively stops any defragmentation on b-trees.
This is a bit unexpected and IMHO undesired. The SSD mode is an
optimization and defrag is supposed to work if the users asks for it.

Related commits:

6702ed490ca0bb44e17131818a5a18b773957c5a
Btrfs: Add run time btree defrag, and an ioctl to force btree defrag

e18e4809b10e6c9efb5fe10c1ddcb4ebb690d517
Btrfs: Add mount -o ssd, which includes optimizations for seek free
storage

b3236e68bf86b3ae87f58984a1822369225211cb
Btrfs: Leave on the tree defragger in mount -o ssd, it still helps there

9afbb0b752ef30a429c45b9de6706e28ad1a36e1
Btrfs: Disable tree defrag in SSD mode

The last three commits switch the defrag+ssd off/on/off and the last one

3f157a2fd2ad731e1ed9964fecdc5f459f04a4a4
Btrfs: Online btree defragmentation fixes

misses the bits from tree-defrag.c to revert to the behaviour introduced
in e18e4809b10e.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-defrag.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a63719cc9578..a4b9c8b2d35a 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 		goto out;
 
-	if (btrfs_test_opt(root, SSD))
-		goto out;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-- 
cgit v1.2.3-55-g7522


From 1a9a8a71ed1d457d4f03284ebfd3e40fe1e217ac Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Apr 2015 19:11:54 +0200
Subject: btrfs: report exact callsite where transaction abort occurs

WARN is called from a single location and all bugreports say that's in
super.c __btrfs_abort_transaction. This is slightly confusing as we'd
rather want to know the exact callsite. Whereas this information is
printed in the syslog below the stacktrace, this requires further look
and we usually see only the headline from WARNING.

Moving the WARN into the macro has to inline some code and increases
code by a few kilobytes:

  text    data     bss     dec     hex filename
835481   20305   14120  869906   d4612 btrfs.ko.before
842883   20305   14120  877308   d62fc btrfs.ko.after

The delta is +7k (130+ calls), measured on 3.19 x86_64, distro config.
The increase is not small and could lead to worse icache use. The code
is on error/exit paths that can be recognized by compiler as cold and
moved out of the way so the impact is speculated to be low, if
measurable at all.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h | 12 +++++++++---
 fs/btrfs/super.c |  8 --------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6f364e1d8d3d..98b33477235b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4111,11 +4111,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
  * Call btrfs_abort_transaction as early as possible when an error condition is
  * detected, that way the exact line number is reported.
  */
-
 #define btrfs_abort_transaction(trans, root, errno)		\
 do {								\
-	__btrfs_abort_transaction(trans, root, __func__,	\
-				  __LINE__, errno);		\
+	/* Report first abort since mount */			\
+	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,	\
+			&((root)->fs_info->fs_state))) {	\
+		WARN(1, KERN_DEBUG				\
+		"BTRFS: Transaction aborted (error %d)\n",	\
+		(errno));					\
+	}							\
+	__btrfs_abort_transaction((trans), (root), __func__,	\
+				  __LINE__, (errno));		\
 } while (0)
 
 #define btrfs_std_error(fs_info, errno)				\
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9e66f5e724db..3c72eea12714 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -251,14 +251,6 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno)
 {
-	/*
-	 * Report first abort since mount
-	 */
-	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
-				&root->fs_info->fs_state)) {
-		WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
-				errno);
-	}
 	trans->aborted = errno;
 	/* Nothing used. The other threads that have joined this
 	 * transaction may be able to continue. */
-- 
cgit v1.2.3-55-g7522


From c0d19e2b9a521bbdc33049ad92c94b517afda1f0 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Apr 2015 19:11:57 +0200
Subject: btrfs: add 'cold' compiler annotations to all error handling
 functions

The annotated functios will be placed into .text.unlikely section. The
annotation also hints compiler to move the code out of the hot paths,
and may implicitly mark if-statement leading to that block as unlikely.

This is a heuristic, the impact on the generated code is not
significant.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h | 4 ++++
 fs/btrfs/super.c | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98b33477235b..670e4be7225b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -4050,6 +4050,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 
 #ifdef CONFIG_BTRFS_ASSERT
 
+__cold
 static inline void assfail(char *expr, char *file, int line)
 {
 	pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
@@ -4065,10 +4066,12 @@ static inline void assfail(char *expr, char *file, int line)
 
 #define btrfs_assert()
 __printf(5, 6)
+__cold
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		     unsigned int line, int errno, const char *fmt, ...);
 
 
+__cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno);
@@ -4138,6 +4141,7 @@ do {								\
 } while (0)
 
 __printf(5, 6)
+__cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3c72eea12714..c761d8e068e8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
  * __btrfs_std_error decodes expected errors from the caller and
  * invokes the approciate error response.
  */
+__cold
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		       unsigned int line, int errno, const char *fmt, ...)
 {
@@ -247,6 +248,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
  * We'll complete the cleanup in btrfs_end_transaction and
  * btrfs_commit_transaction.
  */
+__cold
 void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root, const char *function,
 			       unsigned int line, int errno)
@@ -273,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
  * __btrfs_panic decodes unexpected, fatal errors from the caller,
  * issues an alert, and either panics or BUGs, depending on mount options.
  */
+__cold
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...)
 {
-- 
cgit v1.2.3-55-g7522


From 6d13f5497f9c44597ba566f741f7ce66ed099456 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 24 Apr 2015 19:12:01 +0200
Subject: btrfs: fix warnings after changes in btrfs_abort_transaction

fs/btrfs/volumes.c: In function ‘btrfs_create_uuid_tree’:
fs/btrfs/volumes.c:3909:3: warning: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘long int’ [-Wformat=]
   btrfs_abort_transaction(trans, tree_root,
   ^
  CC [M]  fs/btrfs/ioctl.o
fs/btrfs/ioctl.c: In function ‘create_subvol’:
fs/btrfs/ioctl.c:549:3: warning: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘long int’ [-Wformat=]
   btrfs_abort_transaction(trans, root, PTR_ERR(new_root));

PTR_ERR returns long, but we're really using 'int' for the error codes
everywhere so just set and use the local variable.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c   | 2 +-
 fs/btrfs/volumes.c | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6f790bcddfc1..f77f6b3d24b9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -553,8 +553,8 @@ static noinline int create_subvol(struct inode *dir,
 	key.offset = (u64)-1;
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
 	if (IS_ERR(new_root)) {
-		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
 		ret = PTR_ERR(new_root);
+		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
 	}
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d0582b785485..1fcbd93ca0ec 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3908,9 +3908,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
 	uuid_root = btrfs_create_tree(trans, fs_info,
 				      BTRFS_UUID_TREE_OBJECTID);
 	if (IS_ERR(uuid_root)) {
-		btrfs_abort_transaction(trans, tree_root,
-					PTR_ERR(uuid_root));
-		return PTR_ERR(uuid_root);
+		ret = PTR_ERR(uuid_root);
+		btrfs_abort_transaction(trans, tree_root, ret);
+		return ret;
 	}
 
 	fs_info->uuid_root = uuid_root;
-- 
cgit v1.2.3-55-g7522


From 816fcebe8f41051b29970e5baed4d4afe462d8a8 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Mon, 27 Apr 2015 12:46:18 +0800
Subject: Btrfs: log when missing device is created

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1fcbd93ca0ec..38d3a0f3bcdd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6109,6 +6109,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 				free_extent_map(em);
 				return -EIO;
 			}
+			btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing",
+						devid, uuid);
 		}
 		map->stripes[i].dev->in_fs_metadata = 1;
 	}
-- 
cgit v1.2.3-55-g7522


From 35c766425adc8e6cd1bdc8554fa540364f218fb7 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 30 Apr 2015 17:47:05 +0100
Subject: Btrfs: fix mutex unlock without prior lock on space cache truncation

If the call to btrfs_truncate_inode_items() failed and we don't have a block
group, we were unlocking the cache_write_mutex without having locked it (we
do it only if we have a block group).

Fixes: 1bbc621ef284 ("Btrfs: allow block group cache writeout
                      outside critical section in commit")

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/free-space-cache.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9dbe5b548fa6..fb5a6b1c62a6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 {
 	int ret = 0;
 	struct btrfs_path *path = btrfs_alloc_path();
+	bool locked = false;
 
 	if (!path) {
 		ret = -ENOMEM;
@@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	}
 
 	if (block_group) {
+		locked = true;
 		mutex_lock(&trans->transaction->cache_write_mutex);
 		if (!list_empty(&block_group->io_list)) {
 			list_del_init(&block_group->io_list);
@@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
-	if (ret) {
-		mutex_unlock(&trans->transaction->cache_write_mutex);
-		btrfs_abort_transaction(trans, root, ret);
-		return ret;
-	}
+	if (ret)
+		goto fail;
 
 	ret = btrfs_update_inode(trans, root, inode);
 
-	if (block_group)
-		mutex_unlock(&trans->transaction->cache_write_mutex);
-
 fail:
+	if (locked)
+		mutex_unlock(&trans->transaction->cache_write_mutex);
 	if (ret)
 		btrfs_abort_transaction(trans, root, ret);
 
-- 
cgit v1.2.3-55-g7522


From ab3680dd18f3dea341e32e5020ef16d54a4fb66f Mon Sep 17 00:00:00 2001
From: Christian Engelmayer
Date: Sat, 2 May 2015 17:19:55 +0200
Subject: btrfs: qgroup: Fix possible leak in btrfs_add_qgroup_relation()

Commit 9c8b35b1ba21 ("btrfs: quota: Automatically update related qgroups or
mark INCONSISTENT flags when assigning/deleting a qgroup relations.")
introduced the allocation of a temporary ulist in function
btrfs_add_qgroup_relation() and added the corresponding cleanup to the out
path. However, the allocation was introduced before the src/dst level check
that directly returns. Fix the possible leakage of the ulist by moving the
allocation after the input validation. Detected by Coverity CID 1295988.

Signed-off-by: Christian Engelmayer <cengelma@gmx.at>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 3d6546581bb9..842ff86d4ae8 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1115,14 +1115,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 	struct ulist *tmp;
 	int ret = 0;
 
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
-
 	/* Check the level of src and dst first */
 	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 		return -EINVAL;
 
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	quota_root = fs_info->quota_root;
 	if (!quota_root) {
-- 
cgit v1.2.3-55-g7522


From 619d8c4ef7c5dd346add55da82c9179cd2e3387e Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Sun, 3 May 2015 01:56:00 +0100
Subject: Btrfs: incremental send, fix clone operations for compressed extents

Marc reported a problem where the receiving end of an incremental send
was performing clone operations that failed with -EINVAL. This happened
because, unlike for uncompressed extents, we were not checking if the
source clone offset and length, after summing the data offset, falls
within the source file's boundaries.

So make sure we do such checks when attempting to issue clone operations
for compressed extents.

Problem reproducible with the following steps:

  $ mkfs.btrfs -f /dev/sdb
  $ mount -o compress /dev/sdb /mnt
  $ mkfs.btrfs -f /dev/sdc
  $ mount -o compress /dev/sdc /mnt2

  # Create the file with a single extent of 128K. This creates a metadata file
  # extent item with a data start offset of 0 and a logical length of 128K.
  $ xfs_io -f -c "pwrite -S 0xaa 64K 128K" -c "fsync" /mnt/foo

  # Now rewrite the range 64K to 112K of our file. This will make the inode's
  # metadata continue to point to the 128K extent we created before, but now
  # with an extent item that points to the extent with a data start offset of
  # 112K and a logical length of 16K.
  # That metadata file extent item is associated with the logical file offset
  # at 176K and covers the logical file range 176K to 192K.
  $ xfs_io -c "pwrite -S 0xbb 64K 112K" -c "fsync" /mnt/foo

  # Now rewrite the range 180K to 12K. This will make the inode's metadata
  # continue to point the the 128K extent we created earlier, with a single
  # extent item that points to it with a start offset of 112K and a logical
  # length of 4K.
  # That metadata file extent item is associated with the logical file offset
  # at 176K and covers the logical file range 176K to 180K.
  $ xfs_io -c "pwrite -S 0xcc 180K 12K" -c "fsync" /mnt/foo

  $ btrfs subvolume snapshot -r /mnt /mnt/snap1

  $ touch /mnt/bar
  # Calls the btrfs clone ioctl.
  $ ~/xfstests/src/cloner -s $((176 * 1024)) -d $((176 * 1024)) \
    -l $((4 * 1024)) /mnt/foo /mnt/bar

  $ btrfs subvolume snapshot -r /mnt /mnt/snap2

  $ btrfs send /mnt/snap1 | btrfs receive /mnt2
  At subvol /mnt/snap1
  At subvol snap1

  $ btrfs send -p /mnt/snap1 /mnt/snap2 | btrfs receive /mnt2
  At subvol /mnt/snap2
  At snapshot snap2
  ERROR: failed to clone extents to bar
  Invalid argument

A test case for fstests follows soon.

Reported-by: Marc MERLIN <marc@merlins.org>
Tested-by: Marc MERLIN <marc@merlins.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: David Sterba <dsterba@suse.cz>
Tested-by: Jan Alexander Steffens (heftig) <jan.steffens@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/send.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a1216f9b4917..5cf7838fb5e5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1158,6 +1158,9 @@ struct backref_ctx {
 	/* may be truncated in case it's the last extent in a file */
 	u64 extent_len;
 
+	/* data offset in the file extent item */
+	u64 data_offset;
+
 	/* Just to check for bugs in backref resolving */
 	int found_itself;
 };
@@ -1221,7 +1224,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
 	if (ret < 0)
 		return ret;
 
-	if (offset + bctx->extent_len > i_size)
+	if (offset + bctx->data_offset + bctx->extent_len > i_size)
 		return 0;
 
 	/*
@@ -1363,6 +1366,19 @@ static int find_extent_clone(struct send_ctx *sctx,
 	backref_ctx->cur_offset = data_offset;
 	backref_ctx->found_itself = 0;
 	backref_ctx->extent_len = num_bytes;
+	/*
+	 * For non-compressed extents iterate_extent_inodes() gives us extent
+	 * offsets that already take into account the data offset, but not for
+	 * compressed extents, since the offset is logical and not relative to
+	 * the physical extent locations. We must take this into account to
+	 * avoid sending clone offsets that go beyond the source file's size,
+	 * which would result in the clone ioctl failing with -EINVAL on the
+	 * receiving end.
+	 */
+	if (compressed == BTRFS_COMPRESS_NONE)
+		backref_ctx->data_offset = 0;
+	else
+		backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
-- 
cgit v1.2.3-55-g7522


From 1f6e4b3f9f7c859fed7ac4c0853e976a1a752873 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 5 May 2015 10:53:15 +0800
Subject: btrfs: Fix superblock csum type check.

Old csum type check is wrong and can't catch csum_type 1(not supported).

Fix it to avoid hostile 0 division.

Reported-by: Lukas Lueg <lukas.lueg@gmail.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 670e4be7225b..9d7a6c38f0b1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -174,7 +174,7 @@ struct btrfs_ordered_sum;
 /* csum types */
 #define BTRFS_CSUM_TYPE_CRC32	0
 
-static int btrfs_csum_sizes[] = { 4, 0 };
+static int btrfs_csum_sizes[] = { 4 };
 
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
-- 
cgit v1.2.3-55-g7522


From 33b97e43279a60729eee502809cf5269b3a64cf3 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Fri, 8 May 2015 04:34:35 +0800
Subject: Btrfs: check error before reporting missing device and add uuid

Report missing device when add is successful,
otherwise it would exit as ENOMEM. And add uuid
to the report.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 38d3a0f3bcdd..403ed1fdd901 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6230,10 +6230,11 @@ static int read_one_dev(struct btrfs_root *root,
 		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
 
-		btrfs_warn(root->fs_info, "devid %llu missing", devid);
 		device = add_missing_dev(root, fs_devices, devid, dev_uuid);
 		if (!device)
 			return -ENOMEM;
+		btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+				devid, dev_uuid);
 	} else {
 		if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
 			return -EIO;
-- 
cgit v1.2.3-55-g7522


From 2e6e518335f81d651021a8873da1c938ca0bf1fc Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 12 May 2015 00:28:11 +0100
Subject: Btrfs: fix block group ->space_info null pointer dereference

When we create a block group we add it to the rbtree of block groups
before setting its ->space_info field (while it's NULL). This is
problematic since other tasks can access the block group from the
rbtree and attempt to use its ->space_info before it is set by
btrfs_make_block_group().

This can happen for example when a concurrent fitrim ioctl operation
is ongoing, which produces a trace like the following when
CONFIG_DEBUG_PAGEALLOC is set.

[11509.604369] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
[11509.606373] IP: [<ffffffff8107d675>] __lock_acquire+0xb4/0xf02
[11509.608179] PGD 2296a8067 PUD 22f4a2067 PMD 0
[11509.608179] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
[11509.608179] Modules linked in: btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse acpi_cpufreq processor i2c_piix4 psmou
[11509.608179] CPU: 10 PID: 8538 Comm: fstrim Tainted: G        W       4.0.0-rc5-btrfs-next-9+ #2
[11509.608179] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[11509.608179] task: ffff88009f5c46d0 ti: ffff8801b3edc000 task.ti: ffff8801b3edc000
[11509.608179] RIP: 0010:[<ffffffff8107d675>]  [<ffffffff8107d675>] __lock_acquire+0xb4/0xf02
[11509.608179] RSP: 0018:ffff8801b3edf9e8  EFLAGS: 00010002
[11509.608179] RAX: 0000000000000046 RBX: 0000000000000000 RCX: 0000000000000000
[11509.608179] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000018
[11509.608179] RBP: ffff8801b3edfaa8 R08: 0000000000000001 R09: 0000000000000000
[11509.608179] R10: 0000000000000000 R11: ffff88009f5c4f98 R12: 0000000000000000
[11509.608179] R13: 0000000000000000 R14: 0000000000000018 R15: ffff88009f5c46d0
[11509.608179] FS:  00007f280a10e840(0000) GS:ffff88023ed40000(0000) knlGS:0000000000000000
[11509.608179] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[11509.608179] CR2: 0000000000000018 CR3: 00000002119bc000 CR4: 00000000000006e0
[11509.608179] Stack:
[11509.608179]  0000000000000000 0000000000000000 0000000000000004 0000000000000000
[11509.608179]  ffff880100000000 ffffffff00000000 0000000000000001 ffffffff00000000
[11509.608179]  0000000000000001 0000000000000000 ffff880100000000 00000000000006c4
[11509.608179] Call Trace:
[11509.608179]  [<ffffffff8107dc57>] ? __lock_acquire+0x696/0xf02
[11509.608179]  [<ffffffff8107e806>] lock_acquire+0xa5/0x116
[11509.608179]  [<ffffffffa04cc876>] ? do_trimming+0x51/0x145 [btrfs]
[11509.608179]  [<ffffffff81434f37>] _raw_spin_lock+0x34/0x44
[11509.608179]  [<ffffffffa04cc876>] ? do_trimming+0x51/0x145 [btrfs]
[11509.608179]  [<ffffffffa04cc876>] do_trimming+0x51/0x145 [btrfs]
[11509.608179]  [<ffffffffa04cde7d>] btrfs_trim_block_group+0x201/0x491 [btrfs]
[11509.608179]  [<ffffffffa04849e2>] btrfs_trim_fs+0xe0/0x129 [btrfs]
[11509.608179]  [<ffffffffa04bb80a>] btrfs_ioctl_fitrim+0x138/0x167 [btrfs]
[11509.608179]  [<ffffffffa04c002f>] btrfs_ioctl+0x50d/0x21e8 [btrfs]
[11509.608179]  [<ffffffff81123bda>] ? might_fault+0x58/0xb5
[11509.608179]  [<ffffffff81123bda>] ? might_fault+0x58/0xb5
[11509.608179]  [<ffffffff81123bda>] ? might_fault+0x58/0xb5
[11509.608179]  [<ffffffff81158050>] ? cp_new_stat+0x147/0x15e
[11509.608179]  [<ffffffff81163041>] do_vfs_ioctl+0x3c6/0x479
[11509.608179]  [<ffffffff81158116>] ? SYSC_newfstat+0x25/0x2e
[11509.608179]  [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[11509.608179]  [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[11509.608179]  [<ffffffff8116314e>] SyS_ioctl+0x5a/0x7f
[11509.608179]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[11509.608179] Code: f4 01 00 0f 85 c0 00 00 00 48 c7 c1 f3 1f 7d 81 48 c7 c2 aa cb 7c 81 be fc 0b 00 00 eb 70 83 3d 61 eb 9c 00 00 0f 84 a5 00 00 00 <49> 81 3e 40 a3 2b 82 b8 00 00 00
[11509.608179] RIP  [<ffffffff8107d675>] __lock_acquire+0xb4/0xf02
[11509.608179]  RSP <ffff8801b3edf9e8>
[11509.608179] CR2: 0000000000000018
[11509.608179] ---[ end trace 570a5c6769f0e49a ]---

Which corresponds to the following access in fs/btrfs/free-space-cache.c:

  static int do_trimming(struct btrfs_block_group_cache *block_group,
                         u64 *total_trimmed, u64 start, u64 bytes,
                         u64 reserved_start, u64 reserved_bytes,
                         struct btrfs_trim_range *trim_entry)
  {
       struct btrfs_space_info *space_info = block_group->space_info;
  (...)
       spin_lock(&space_info->lock);
       ^^^^^ - block_group->space_info is NULL...

Fix this by ensuring the block group's ->space_info is set before adding
the block group to the rbtree.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0ec3acd14cbf..1cbc71d8cb96 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3693,7 +3693,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		found->disk_total += total_bytes * factor;
 		found->bytes_used += bytes_used;
 		found->disk_used += bytes_used * factor;
-		found->full = 0;
+		if (total_bytes > 0)
+			found->full = 0;
 		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
@@ -3721,7 +3722,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
 	found->bytes_may_use = 0;
-	found->full = 0;
+	if (total_bytes > 0)
+		found->full = 0;
+	else
+		found->full = 1;
 	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	found->chunk_alloc = 0;
 	found->flush = 0;
@@ -9562,6 +9566,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	free_excluded_extents(root, cache);
 
+	/*
+	 * Call to ensure the corresponding space_info object is created and
+	 * assigned to our block group, but don't update its counters just yet.
+	 * We want our bg to be added to the rbtree with its ->space_info set.
+	 */
+	ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+				&cache->space_info);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
 	if (ret) {
 		btrfs_remove_free_space_cache(cache);
@@ -9569,6 +9586,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
+	/*
+	 * Now that our block group has its ->space_info set and is inserted in
+	 * the rbtree, update the space info's counters.
+	 */
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	if (ret) {
-- 
cgit v1.2.3-55-g7522


From 01b810b889d557257580970e1a7ba9c85b54766b Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Tue, 12 May 2015 19:14:49 +0200
Subject: btrfs: make root id query unprivileged

The INO_LOOKUP ioctl can lookup path for a given inode number and is
thus restricted. As a sideefect it can find the root id of the
containing subvolume and we're using this int the 'btrfs inspect rootid'
command.

The restriction is unnecessary in case we set the ioctl args
 args::treeid    = 0
 args::objectid  = 256 (BTRFS_FIRST_FREE_OBJECTID)

Then the path will be empty and the treeid is filled with the root id of
the inode on which the ioctl is called. This behaviour is unchanged,
after the root restriction is removed.

Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f77f6b3d24b9..f7c65ca056f8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2271,10 +2271,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 {
 	 struct btrfs_ioctl_ino_lookup_args *args;
 	 struct inode *inode;
-	 int ret;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	int ret = 0;
 
 	args = memdup_user(argp, sizeof(*args));
 	if (IS_ERR(args))
@@ -2282,13 +2279,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
 
 	inode = file_inode(file);
 
+	/*
+	 * Unprivileged query to obtain the containing subvolume root id. The
+	 * path is reset so it's consistent with btrfs_search_path_in_tree.
+	 */
 	if (args->treeid == 0)
 		args->treeid = BTRFS_I(inode)->root->root_key.objectid;
 
+	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
+		args->name[0] = 0;
+		goto out;
+	}
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
 					args->treeid, args->objectid,
 					args->name);
 
+out:
 	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
 		ret = -EFAULT;
 
-- 
cgit v1.2.3-55-g7522


From 2037a0933bc2894a2f50ae57a1ccf6be192adb76 Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Tue, 12 May 2015 19:31:37 -0400
Subject: btrfs: use after free when closing devices

__btrfs_close_devices() would call_rcu to free the device, which is racy with
list_for_each_entry() accessing the memory to retrieve the next device on the
list.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 403ed1fdd901..c99f29a52e0b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -693,13 +693,13 @@ static void free_device(struct rcu_head *head)
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_device *device;
+	struct btrfs_device *device, *tmp;
 
 	if (--fs_devices->opened > 0)
 		return 0;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
 		struct btrfs_device *new_device;
 		struct rcu_string *name;
 
-- 
cgit v1.2.3-55-g7522


From c152b63efc94d5fd486a39e9df5e61ae77e08e44 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 14 May 2015 10:46:03 +0100
Subject: Btrfs: fix chunk allocation regression leading to transaction abort

With commit 1b9845081633 ("Btrfs: fix find_free_dev_extent() malfunction
in case device tree has hole") introduced in the kernel 4.1 merge window,
we end up using part of a device hole for which there are already pending
chunks or pinned chunks. Before that commit we didn't use the hole and
would just move on to the next hole in the device.

However when we adjust the start offset for the chunk allocation and we
have pinned chunks, we set it blindly to the end offset of the pinned
chunk we are currently processing, which is dangerous because we can
have a pending chunk that has a start offset that matches the end offset
of our pinned chunk - leading us to a case where we end up getting two
pending chunks that start at the same physical device offset, which makes
us later abort the current transaction with -EEXIST when finishing the
chunk allocation at btrfs_create_pending_block_groups():

[194737.659017] ------------[ cut here ]------------
[194737.660192] WARNING: CPU: 15 PID: 31111 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[194737.662209] BTRFS: Transaction aborted (error -17)
[194737.663175] Modules linked in: btrfs dm_snapshot dm_bufio dm_flakey dm_mod crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse
[194737.674015] CPU: 15 PID: 31111 Comm: xfs_io Tainted: G        W       4.0.0-rc5-btrfs-next-9+ #2
[194737.675986] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
[194737.682999]  0000000000000009 ffff8800564c7a98 ffffffff8142fa46 ffffffff8108b6a2
[194737.684540]  ffff8800564c7ae8 ffff8800564c7ad8 ffffffff81045ea5 ffff8800564c7b78
[194737.686017]  ffffffffa0383aa7 00000000ffffffef ffff88000c7ba000 ffff8801a1f66f40
[194737.687509] Call Trace:
[194737.688068]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[194737.689027]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[194737.690095]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[194737.691198]  [<ffffffffa0383aa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[194737.693789]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[194737.695065]  [<ffffffffa0383aa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[194737.696806]  [<ffffffffa039a3bd>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[194737.698683]  [<ffffffffa03aa433>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[194737.700329]  [<ffffffffa03aa725>] btrfs_end_transaction+0x10/0x12 [btrfs]
[194737.701924]  [<ffffffffa0394b51>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[194737.703675]  [<ffffffffa03b8ba4>] __btrfs_buffered_write+0x16a/0x4c8 [btrfs]
[194737.705417]  [<ffffffffa03bb502>] ? btrfs_file_write_iter+0x19a/0x431 [btrfs]
[194737.707058]  [<ffffffffa03bb511>] ? btrfs_file_write_iter+0x1a9/0x431 [btrfs]
[194737.708560]  [<ffffffffa03bb68d>] btrfs_file_write_iter+0x325/0x431 [btrfs]
[194737.710673]  [<ffffffff81067d85>] ? get_parent_ip+0xe/0x3e
[194737.712076]  [<ffffffff811534c3>] new_sync_write+0x7c/0xa0
[194737.713293]  [<ffffffff81153b58>] vfs_write+0xb2/0x117
[194737.714443]  [<ffffffff81154424>] SyS_pwrite64+0x64/0x82
[194737.715646]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[194737.717175] ---[ end trace f2d5dc04e56d7e48 ]---
[194737.718170] BTRFS: error (device sdc) in btrfs_create_pending_block_groups:9524: errno=-17 Object already exists

The -EEXIST failure comes from btrfs_finish_chunk_alloc(), called by
btrfs_create_pending_block_groups(), when it attempts to insert a
duplicated device extent item via btrfs_alloc_dev_extent().

This issue was reproducible with fstests generic/038 running in a loop for
several hours (it's very hard to hit) and using MOUNT_OPTIONS="-o discard".
Applying Jeff's recent patch titled "btrfs: add missing discards when
unpinning extents with -o discard" makes the issue much easier to reproduce
(usually within 4 to 5 hours), since it pins chunks for longer periods of
time when an unused block group is deleted by the cleaner kthread.

Fix this by making sure that we never adjust the start offset to a lower
value than it currently has.

Fixes: 1b9845081633 ("Btrfs: fix find_free_dev_extent() malfunction in case device tree has hole"
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c99f29a52e0b..534be440dd56 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1067,15 +1067,31 @@ again:
 
 		map = (struct map_lookup *)em->bdev;
 		for (i = 0; i < map->num_stripes; i++) {
+			u64 end;
+
 			if (map->stripes[i].dev != device)
 				continue;
 			if (map->stripes[i].physical >= physical_start + len ||
 			    map->stripes[i].physical + em->orig_block_len <=
 			    physical_start)
 				continue;
-			*start = map->stripes[i].physical +
-				em->orig_block_len;
-			ret = 1;
+			/*
+			 * Make sure that while processing the pinned list we do
+			 * not override our *start with a lower value, because
+			 * we can have pinned chunks that fall within this
+			 * device hole and that have lower physical addresses
+			 * than the pending chunks we processed before. If we
+			 * do not take this special care we can end up getting
+			 * 2 pending chunks that start at the same physical
+			 * device offsets because the end offset of a pinned
+			 * chunk can be equal to the start offset of some
+			 * pending chunk.
+			 */
+			end = map->stripes[i].physical + em->orig_block_len;
+			if (end > *start) {
+				*start = end;
+				ret = 1;
+			}
 		}
 	}
 	if (search_list == &trans->transaction->pending_chunks) {
-- 
cgit v1.2.3-55-g7522


From 0f31871f4411b5c0d42fb4403dec83a21a96100b Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Thu, 14 May 2015 20:41:07 +0100
Subject: Btrfs: wake up extent state waiters on unlock through
 clear_extent_bits

When we clear an extent state's EXTENT_LOCKED bit with clear_extent_bits()
through free_io_failure(), we weren't waking up any tasks waiting for the
extent's state EXTENT_LOCKED bit, leading to an hang.

So make sure clear_extent_bits() ends up waking up any waiters if the
bit EXTENT_LOCKED is supplied by its callers.

Zygo Blaxell was experiencing such hangs at inode eviction time after
file unlinks. Thanks to him for a set of scripts to reproduce the issue.

Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c32d226bfecc..856c2e8ea6ac 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      unsigned bits, gfp_t mask)
 {
-	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+	int wake = 0;
+
+	if (bits & EXTENT_LOCKED)
+		wake = 1;
+
+	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-- 
cgit v1.2.3-55-g7522


From 773cd04ec1911abb33cf9538b65f55b76cad5d92 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 18 May 2015 02:16:26 -0700
Subject: Btrfs: lock superblock before remounting for rw subvol

Since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes with
different ro/rw options"), when mounting a subvolume read/write when
another subvolume has previously been mounted read-only, we first do a
remount. However, this should be done with the superblock locked, as per
sync_filesystem():

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

This WARN_ON can easily be hit with:

mkfs.btrfs -f /dev/vdb
mount /dev/vdb /mnt
btrfs subvol create /mnt/vol1
btrfs subvol create /mnt/vol2
umount /mnt
mount -oro,subvol=/vol1 /dev/vdb /mnt
mount -orw,subvol=/vol2 /dev/vdb /mnt2

Fixes: 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes with different ro/rw options")
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c761d8e068e8..93f2edf6b603 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1209,7 +1209,9 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 				return ERR_CAST(mnt);
 			}
 
+			down_write(&mnt->mnt_sb->s_umount);
 			r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+			up_write(&mnt->mnt_sb->s_umount);
 			if (r < 0) {
 				/* FIXME: release vfsmount mnt ??*/
 				kfree(newargs);
-- 
cgit v1.2.3-55-g7522


From e6e4dbe894ef29bd4aa05a7702e0cded1943dd10 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 18 May 2015 02:16:27 -0700
Subject: Btrfs: remove all subvol options before mounting top-level

Currently, setup_root_args() substitutes 's/subvol=[^,]*/subvolid=0/'.
But, this means that if the user passes both a subvol and subvolid for
some reason, we won't actually mount the top-level when we recursively
mount. For example, consider:

mkfs.btrfs -f /dev/sdb
mount /dev/sdb /mnt
btrfs subvol create /mnt/subvol1 # subvolid=257
btrfs subvol create /mnt/subvol2 # subvolid=258
umount /mnt
mount -osubvol=/subvol1,subvolid=258 /dev/sdb /mnt

In the final mount, subvol=/subvol1,subvolid=258 becomes
subvolid=0,subvolid=258, and the last option takes precedence, so we
mount subvol2 and try to look up subvol1 inside of it, which fails.

So, instead, do a thorough scan through the argument list and remove any
subvol= and subvolid= options, then append subvolid=0 to the end. This
implicitly makes subvol= take precedence over subvolid=, but we're about
to add a stricter check for that. This also makes setup_root_args() more
generic, which we'll need soon.

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 56 ++++++++++++++++++++------------------------------------
 1 file changed, 20 insertions(+), 36 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 93f2edf6b603..1c57c0d0dc41 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1133,52 +1133,36 @@ static inline int is_subvolume_inode(struct inode *inode)
 }
 
 /*
- * This will strip out the subvol=%s argument for an argument string and add
- * subvolid=0 to make sure we get the actual tree root for path walking to the
- * subvol we want.
+ * This will add subvolid=0 to the argument string while removing any subvol=
+ * and subvolid= arguments to make sure we get the top-level root for path
+ * walking to the subvol we want.
  */
 static char *setup_root_args(char *args)
 {
-	unsigned len = strlen(args) + 2 + 1;
-	char *src, *dst, *buf;
+	char *buf, *dst, *sep;
 
-	/*
-	 * We need the same args as before, but with this substitution:
-	 * s!subvol=[^,]+!subvolid=0!
-	 *
-	 * Since the replacement string is up to 2 bytes longer than the
-	 * original, allocate strlen(args) + 2 + 1 bytes.
-	 */
-
-	src = strstr(args, "subvol=");
-	/* This shouldn't happen, but just in case.. */
-	if (!src)
-		return NULL;
+	if (!args)
+		return kstrdup("subvolid=0", GFP_NOFS);
 
-	buf = dst = kmalloc(len, GFP_NOFS);
+	/* The worst case is that we add ",subvolid=0" to the end. */
+	buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS);
 	if (!buf)
 		return NULL;
 
-	/*
-	 * If the subvol= arg is not at the start of the string,
-	 * copy whatever precedes it into buf.
-	 */
-	if (src != args) {
-		*src++ = '\0';
-		strcpy(buf, args);
-		dst += strlen(args);
+	while (1) {
+		sep = strchrnul(args, ',');
+		if (!strstarts(args, "subvol=") &&
+		    !strstarts(args, "subvolid=")) {
+			memcpy(dst, args, sep - args);
+			dst += sep - args;
+			*dst++ = ',';
+		}
+		if (*sep)
+			args = sep + 1;
+		else
+			break;
 	}
-
 	strcpy(dst, "subvolid=0");
-	dst += strlen("subvolid=0");
-
-	/*
-	 * If there is a "," after the original subvol=... string,
-	 * copy that suffix into our buffer.  Otherwise, we're done.
-	 */
-	src = strchr(src, ',');
-	if (src)
-		strcpy(dst, src);
 
 	return buf;
 }
-- 
cgit v1.2.3-55-g7522


From fa3306595071ae03e181c2336d0dcd8f5e3cf9b0 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 18 May 2015 02:16:28 -0700
Subject: Btrfs: clean up error handling in mount_subvol()

In preparation for new functionality in mount_subvol(), give it
ownership of subvol_name and tidy up the error paths.

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 61 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1c57c0d0dc41..dd89dd353c4a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1171,55 +1171,61 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 				   const char *device_name, char *data)
 {
 	struct dentry *root;
-	struct vfsmount *mnt;
+	struct vfsmount *mnt = NULL;
 	char *newargs;
+	int ret;
 
 	newargs = setup_root_args(data);
-	if (!newargs)
-		return ERR_PTR(-ENOMEM);
-	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
-			     newargs);
+	if (!newargs) {
+		root = ERR_PTR(-ENOMEM);
+		goto out;
+	}
 
-	if (PTR_RET(mnt) == -EBUSY) {
+	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs);
+	if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) {
 		if (flags & MS_RDONLY) {
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
-					     newargs);
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY,
+					     device_name, newargs);
 		} else {
-			int r;
-			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
-					     newargs);
+			mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY,
+					     device_name, newargs);
 			if (IS_ERR(mnt)) {
-				kfree(newargs);
-				return ERR_CAST(mnt);
+				root = ERR_CAST(mnt);
+				mnt = NULL;
+				goto out;
 			}
 
 			down_write(&mnt->mnt_sb->s_umount);
-			r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+			ret = btrfs_remount(mnt->mnt_sb, &flags, NULL);
 			up_write(&mnt->mnt_sb->s_umount);
-			if (r < 0) {
-				/* FIXME: release vfsmount mnt ??*/
-				kfree(newargs);
-				return ERR_PTR(r);
+			if (ret < 0) {
+				root = ERR_PTR(ret);
+				goto out;
 			}
 		}
 	}
-
-	kfree(newargs);
-
-	if (IS_ERR(mnt))
-		return ERR_CAST(mnt);
+	if (IS_ERR(mnt)) {
+		root = ERR_CAST(mnt);
+		mnt = NULL;
+		goto out;
+	}
 
 	root = mount_subtree(mnt, subvol_name);
+	/* mount_subtree() drops our reference on the vfsmount. */
+	mnt = NULL;
 
 	if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
 		struct super_block *s = root->d_sb;
 		dput(root);
 		root = ERR_PTR(-EINVAL);
 		deactivate_locked_super(s);
-		printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
-				subvol_name);
+		pr_err("BTRFS: '%s' is not a valid subvolume\n", subvol_name);
 	}
 
+out:
+	mntput(mnt);
+	kfree(newargs);
+	kfree(subvol_name);
 	return root;
 }
 
@@ -1305,9 +1311,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (subvol_name) {
-		root = mount_subvol(subvol_name, flags, device_name, data);
-		kfree(subvol_name);
-		return root;
+		/* mount_subvol() will free subvol_name. */
+		return mount_subvol(subvol_name, flags, device_name, data);
 	}
 
 	security_init_mnt_opts(&new_sec_opts);
-- 
cgit v1.2.3-55-g7522


From bb289b7be62db84b9630ce00367444c810cada2c Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 18 May 2015 02:16:29 -0700
Subject: Btrfs: fail on mismatched subvol and subvolid mount options

There's nothing to stop a user from passing both subvol= and subvolid=
to mount, but if they don't refer to the same subvolume, someone is
going to be surprised at some point. Error out on this case, but allow
users to pass in both if they do match (which they could, for example,
get out of /proc/mounts).

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dd89dd353c4a..3ad97953537c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1167,8 +1167,9 @@ static char *setup_root_args(char *args)
 	return buf;
 }
 
-static struct dentry *mount_subvol(const char *subvol_name, int flags,
-				   const char *device_name, char *data)
+static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
+				   int flags, const char *device_name,
+				   char *data)
 {
 	struct dentry *root;
 	struct vfsmount *mnt = NULL;
@@ -1214,12 +1215,27 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 	/* mount_subtree() drops our reference on the vfsmount. */
 	mnt = NULL;
 
-	if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) {
+	if (!IS_ERR(root)) {
 		struct super_block *s = root->d_sb;
-		dput(root);
-		root = ERR_PTR(-EINVAL);
-		deactivate_locked_super(s);
-		pr_err("BTRFS: '%s' is not a valid subvolume\n", subvol_name);
+		struct inode *root_inode = d_inode(root);
+		u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;
+
+		ret = 0;
+		if (!is_subvolume_inode(root_inode)) {
+			pr_err("BTRFS: '%s' is not a valid subvolume\n",
+			       subvol_name);
+			ret = -EINVAL;
+		}
+		if (subvol_objectid && root_objectid != subvol_objectid) {
+			pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
+			       subvol_name, subvol_objectid);
+			ret = -EINVAL;
+		}
+		if (ret) {
+			dput(root);
+			root = ERR_PTR(ret);
+			deactivate_locked_super(s);
+		}
 	}
 
 out:
@@ -1312,7 +1328,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 	if (subvol_name) {
 		/* mount_subvol() will free subvol_name. */
-		return mount_subvol(subvol_name, flags, device_name, data);
+		return mount_subvol(subvol_name, subvol_objectid, flags,
+				    device_name, data);
 	}
 
 	security_init_mnt_opts(&new_sec_opts);
-- 
cgit v1.2.3-55-g7522


From 05dbe6837b60465db6a61d1f0cabe745e20aadc9 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 18 May 2015 02:16:30 -0700
Subject: Btrfs: unify subvol= and subvolid= mounting

Currently, mounting a subvolume with subvolid= takes a different code
path than mounting with subvol=. This isn't really a big deal except for
the fact that mounts done with subvolid= or the default subvolume don't
have a dentry that's connected to the dentry tree like in the subvol=
case. To unify the code paths, when given subvolid= or using the default
subvolume ID, translate it into a subvolume name by walking
ROOT_BACKREFs in the root tree and INODE_REFs in the filesystem trees.

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 238 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 171 insertions(+), 67 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3ad97953537c..6fa4aaf0543a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -836,33 +836,153 @@ out:
 	return error;
 }
 
-static struct dentry *get_default_root(struct super_block *sb,
-				       u64 subvol_objectid)
+static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+					   u64 subvol_objectid)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *root = fs_info->tree_root;
-	struct btrfs_root *new_root;
-	struct btrfs_dir_item *di;
-	struct btrfs_path *path;
-	struct btrfs_key location;
-	struct inode *inode;
-	u64 dir_id;
-	int new = 0;
+	struct btrfs_root *fs_root;
+	struct btrfs_root_ref *root_ref;
+	struct btrfs_inode_ref *inode_ref;
+	struct btrfs_key key;
+	struct btrfs_path *path = NULL;
+	char *name = NULL, *ptr;
+	u64 dirid;
+	int len;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	path->leave_spinning = 1;
+
+	name = kmalloc(PATH_MAX, GFP_NOFS);
+	if (!name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	ptr = name + PATH_MAX - 1;
+	ptr[0] = '\0';
 
 	/*
-	 * We have a specific subvol we want to mount, just setup location and
-	 * go look up the root.
+	 * Walk up the subvolume trees in the tree of tree roots by root
+	 * backrefs until we hit the top-level subvolume.
 	 */
-	if (subvol_objectid) {
-		location.objectid = subvol_objectid;
-		location.type = BTRFS_ROOT_ITEM_KEY;
-		location.offset = (u64)-1;
-		goto find_root;
+	while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
+		key.objectid = subvol_objectid;
+		key.type = BTRFS_ROOT_BACKREF_KEY;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			goto err;
+		} else if (ret > 0) {
+			ret = btrfs_previous_item(root, path, subvol_objectid,
+						  BTRFS_ROOT_BACKREF_KEY);
+			if (ret < 0) {
+				goto err;
+			} else if (ret > 0) {
+				ret = -ENOENT;
+				goto err;
+			}
+		}
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		subvol_objectid = key.offset;
+
+		root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					  struct btrfs_root_ref);
+		len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
+		ptr -= len + 1;
+		if (ptr < name) {
+			ret = -ENAMETOOLONG;
+			goto err;
+		}
+		read_extent_buffer(path->nodes[0], ptr + 1,
+				   (unsigned long)(root_ref + 1), len);
+		ptr[0] = '/';
+		dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
+		btrfs_release_path(path);
+
+		key.objectid = subvol_objectid;
+		key.type = BTRFS_ROOT_ITEM_KEY;
+		key.offset = (u64)-1;
+		fs_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(fs_root)) {
+			ret = PTR_ERR(fs_root);
+			goto err;
+		}
+
+		/*
+		 * Walk up the filesystem tree by inode refs until we hit the
+		 * root directory.
+		 */
+		while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
+			key.objectid = dirid;
+			key.type = BTRFS_INODE_REF_KEY;
+			key.offset = (u64)-1;
+
+			ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+			if (ret < 0) {
+				goto err;
+			} else if (ret > 0) {
+				ret = btrfs_previous_item(fs_root, path, dirid,
+							  BTRFS_INODE_REF_KEY);
+				if (ret < 0) {
+					goto err;
+				} else if (ret > 0) {
+					ret = -ENOENT;
+					goto err;
+				}
+			}
+
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			dirid = key.offset;
+
+			inode_ref = btrfs_item_ptr(path->nodes[0],
+						   path->slots[0],
+						   struct btrfs_inode_ref);
+			len = btrfs_inode_ref_name_len(path->nodes[0],
+						       inode_ref);
+			ptr -= len + 1;
+			if (ptr < name) {
+				ret = -ENAMETOOLONG;
+				goto err;
+			}
+			read_extent_buffer(path->nodes[0], ptr + 1,
+					   (unsigned long)(inode_ref + 1), len);
+			ptr[0] = '/';
+			btrfs_release_path(path);
+		}
 	}
 
+	btrfs_free_path(path);
+	if (ptr == name + PATH_MAX - 1) {
+		name[0] = '/';
+		name[1] = '\0';
+	} else {
+		memmove(name, ptr, name + PATH_MAX - ptr);
+	}
+	return name;
+
+err:
+	btrfs_free_path(path);
+	kfree(name);
+	return ERR_PTR(ret);
+}
+
+static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
+{
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_dir_item *di;
+	struct btrfs_path *path;
+	struct btrfs_key location;
+	u64 dir_id;
+
 	path = btrfs_alloc_path();
 	if (!path)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	/*
@@ -874,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb,
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
 	if (IS_ERR(di)) {
 		btrfs_free_path(path);
-		return ERR_CAST(di);
+		return PTR_ERR(di);
 	}
 	if (!di) {
 		/*
 		 * Ok the default dir item isn't there.  This is weird since
 		 * it's always been there, but don't freak out, just try and
-		 * mount to root most subvolume.
+		 * mount the top-level subvolume.
 		 */
 		btrfs_free_path(path);
-		dir_id = BTRFS_FIRST_FREE_OBJECTID;
-		new_root = fs_info->fs_root;
-		goto setup_root;
+		*objectid = BTRFS_FS_TREE_OBJECTID;
+		return 0;
 	}
 
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 	btrfs_free_path(path);
-
-find_root:
-	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
-	if (IS_ERR(new_root))
-		return ERR_CAST(new_root);
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		int ret;
-		down_read(&fs_info->cleanup_work_sem);
-		ret = btrfs_orphan_cleanup(new_root);
-		up_read(&fs_info->cleanup_work_sem);
-		if (ret)
-			return ERR_PTR(ret);
-	}
-
-	dir_id = btrfs_root_dirid(&new_root->root_item);
-setup_root:
-	location.objectid = dir_id;
-	location.type = BTRFS_INODE_ITEM_KEY;
-	location.offset = 0;
-
-	inode = btrfs_iget(sb, &location, new_root, &new);
-	if (IS_ERR(inode))
-		return ERR_CAST(inode);
-
-	/*
-	 * If we're just mounting the root most subvol put the inode and return
-	 * a reference to the dentry.  We will have already gotten a reference
-	 * to the inode in btrfs_fill_super so we're good to go.
-	 */
-	if (!new && d_inode(sb->s_root) == inode) {
-		iput(inode);
-		return dget(sb->s_root);
-	}
-
-	return d_obtain_root(inode);
+	*objectid = location.objectid;
+	return 0;
 }
 
 static int btrfs_fill_super(struct super_block *sb,
@@ -1211,6 +1296,25 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 		goto out;
 	}
 
+	if (!subvol_name) {
+		if (!subvol_objectid) {
+			ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
+							  &subvol_objectid);
+			if (ret) {
+				root = ERR_PTR(ret);
+				goto out;
+			}
+		}
+		subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
+							    subvol_objectid);
+		if (IS_ERR(subvol_name)) {
+			root = ERR_CAST(subvol_name);
+			subvol_name = NULL;
+			goto out;
+		}
+
+	}
+
 	root = mount_subtree(mnt, subvol_name);
 	/* mount_subtree() drops our reference on the vfsmount. */
 	mnt = NULL;
@@ -1227,6 +1331,11 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
 			ret = -EINVAL;
 		}
 		if (subvol_objectid && root_objectid != subvol_objectid) {
+			/*
+			 * This will also catch a race condition where a
+			 * subvolume which was passed by ID is renamed and
+			 * another subvolume is renamed over the old location.
+			 */
 			pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n",
 			       subvol_name, subvol_objectid);
 			ret = -EINVAL;
@@ -1306,7 +1415,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 {
 	struct block_device *bdev = NULL;
 	struct super_block *s;
-	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
 	struct btrfs_fs_info *fs_info = NULL;
 	struct security_mnt_opts new_sec_opts;
@@ -1326,7 +1434,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		return ERR_PTR(error);
 	}
 
-	if (subvol_name) {
+	if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
 		/* mount_subvol() will free subvol_name. */
 		return mount_subvol(subvol_name, subvol_objectid, flags,
 				    device_name, data);
@@ -1395,23 +1503,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 	}
-
-	root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
-	if (IS_ERR(root)) {
+	if (error) {
 		deactivate_locked_super(s);
-		error = PTR_ERR(root);
 		goto error_sec_opts;
 	}
 
 	fs_info = btrfs_sb(s);
 	error = setup_security_options(fs_info, s, &new_sec_opts);
 	if (error) {
-		dput(root);
 		deactivate_locked_super(s);
 		goto error_sec_opts;
 	}
 
-	return root;
+	return dget(s->s_root);
 
 error_close_devices:
 	btrfs_close_devices(fs_devices);
-- 
cgit v1.2.3-55-g7522


From c8d3fe028f64054d75c72566efb9ecf3c75ee161 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Mon, 18 May 2015 02:16:31 -0700
Subject: Btrfs: show subvol= and subvolid= in /proc/mounts

Now that we're guaranteed to have a meaningful root dentry, we can just
export seq_dentry() and use it in btrfs_show_options(). The subvolume ID
is easy to get and can also be useful, so put that in there, too.

Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 4 ++++
 fs/seq_file.c    | 1 +
 2 files changed, 5 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6fa4aaf0543a..cd7ef34d2dce 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1188,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",fatal_errors=panic");
 	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
 		seq_printf(seq, ",commit=%d", info->commit_interval);
+	seq_printf(seq, ",subvolid=%llu",
+		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+	seq_puts(seq, ",subvol=");
+	seq_dentry(seq, dentry, " \t\n\\");
 	return 0;
 }
 
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155be8..52b492721603 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -538,6 +538,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
 
 	return res;
 }
+EXPORT_SYMBOL(seq_dentry);
 
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
-- 
cgit v1.2.3-55-g7522


From 0d2b2372e097cd3b4150d3ec91e79ac3c5cc750e Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 19 May 2015 10:44:04 -0400
Subject: Btrfs: set UNWRITTEN for prealloc'ed extents in fiemap

We should be doing this, it's weird we hadn't been doing this.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent_io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 856c2e8ea6ac..a3ec2c8610cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4497,6 +4497,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
+		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
 
 		free_extent_map(em);
 		em = NULL;
-- 
cgit v1.2.3-55-g7522


From 4fbcdf6694544fd9d2aedbc1e73e52b90a4fcc20 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 20 May 2015 14:01:54 +0100
Subject: Btrfs: fix -ENOSPC when finishing block group creation

While creating a block group, we often end up getting ENOSPC while updating
the chunk tree, which leads to a transaction abortion that produces a trace
like the following:

[30670.116368] WARNING: CPU: 4 PID: 20735 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x106 [btrfs]()
[30670.117777] BTRFS: Transaction aborted (error -28)
(...)
[30670.163567] Call Trace:
[30670.163906]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[30670.164522]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[30670.165171]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[30670.166323]  [<ffffffffa035daa7>] ? __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.167213]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[30670.167862]  [<ffffffffa035daa7>] __btrfs_abort_transaction+0x52/0x106 [btrfs]
[30670.169116]  [<ffffffffa03743d7>] btrfs_create_pending_block_groups+0x101/0x130 [btrfs]
[30670.170593]  [<ffffffffa038426a>] __btrfs_end_transaction+0x84/0x366 [btrfs]
[30670.171960]  [<ffffffffa038455c>] btrfs_end_transaction+0x10/0x12 [btrfs]
[30670.174649]  [<ffffffffa036eb6b>] btrfs_check_data_free_space+0x11f/0x27c [btrfs]
[30670.176092]  [<ffffffffa039450d>] btrfs_fallocate+0x7c8/0xb96 [btrfs]
[30670.177218]  [<ffffffff812459f2>] ? __this_cpu_preempt_check+0x13/0x15
[30670.178622]  [<ffffffff81152447>] vfs_fallocate+0x14c/0x1de
[30670.179642]  [<ffffffff8116b915>] ? __fget_light+0x2d/0x4f
[30670.180692]  [<ffffffff81152863>] SyS_fallocate+0x47/0x62
[30670.186737]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[30670.187792] ---[ end trace 0373e6b491c4a8cc ]---

This is because we don't do proper space reservation for the chunk block
reserve when we have multiple tasks allocating chunks in parallel.

So block group creation has 2 phases, and the first phase essentially
checks if there is enough space in the system space_info, allocating a
new system chunk if there isn't, while the second phase updates the
device, extent and chunk trees. However, because the updates to the
chunk tree happen in the second phase, if we have N tasks, each with
its own transaction handle, allocating new chunks in parallel and if
there is only enough space in the system space_info to allocate M chunks,
where M < N, none of the tasks ends up allocating a new system chunk in
the first phase and N - M tasks will get -ENOSPC when attempting to
update the chunk tree in phase 2 if they need to COW any nodes/leafs
from the chunk tree.

Fix this by doing proper reservation in the chunk block reserve.

The issue could be reproduced by running fstests generic/038 in a loop,
which eventually triggered the problem.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/extent-tree.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/transaction.c |  6 ++++++
 fs/btrfs/transaction.h |  1 +
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9d7a6c38f0b1..8ee5645ef9e1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3458,6 +3458,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1cbc71d8cb96..4e08e47ace30 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4116,11 +4116,19 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_space_info *info;
 	u64 left;
 	u64 thresh;
+	int ret = 0;
+
+	/*
+	 * Needed because we can end up allocating a system chunk and for an
+	 * atomic and race free space reservation in the chunk block reserve.
+	 */
+	ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
 
 	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	spin_lock(&info->lock);
 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
-		info->bytes_reserved - info->bytes_readonly;
+		info->bytes_reserved - info->bytes_readonly -
+		info->bytes_may_use;
 	spin_unlock(&info->lock);
 
 	thresh = get_system_chunk_thresh(root, type);
@@ -4134,7 +4142,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 		u64 flags;
 
 		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
-		btrfs_alloc_chunk(trans, root, flags);
+		/*
+		 * Ignore failure to create system chunk. We might end up not
+		 * needing it, as we might not need to COW all nodes/leafs from
+		 * the paths we visit in the chunk tree (they were already COWed
+		 * or created in the current transaction for example).
+		 */
+		ret = btrfs_alloc_chunk(trans, root, flags);
+	}
+
+	if (!ret) {
+		ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+					  &root->fs_info->chunk_block_rsv,
+					  thresh, BTRFS_RESERVE_NO_FLUSH);
+		if (!ret)
+			trans->chunk_bytes_reserved += thresh;
 	}
 }
 
@@ -5192,6 +5214,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 	trans->bytes_reserved = 0;
 }
 
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+	if (!trans->chunk_bytes_reserved)
+		return;
+
+	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+				trans->chunk_bytes_reserved);
+	trans->chunk_bytes_reserved = 0;
+}
+
 /* Can only return 0 or -ENOSPC */
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5628e25250c0..03a3ec7e31ea 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -509,6 +509,7 @@ again:
 	h->transaction = cur_trans;
 	h->blocks_used = 0;
 	h->bytes_reserved = 0;
+	h->chunk_bytes_reserved = 0;
 	h->root = root;
 	h->delayed_ref_updates = 0;
 	h->use_count = 1;
@@ -792,6 +793,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	if (!list_empty(&trans->new_bgs))
 		btrfs_create_pending_block_groups(trans, root);
 
+	btrfs_trans_release_chunk_metadata(trans);
+
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root) &&
 	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -2054,6 +2057,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
 	clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
 
+	btrfs_trans_release_chunk_metadata(trans);
+
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	root->fs_info->running_transaction = NULL;
@@ -2123,6 +2128,7 @@ scrub_continue:
 	btrfs_scrub_continue(root);
 cleanup_transaction:
 	btrfs_trans_release_metadata(trans, root);
+	btrfs_trans_release_chunk_metadata(trans);
 	trans->block_rsv = NULL;
 	if (trans->qgroup_reserved) {
 		btrfs_qgroup_free(root, trans->qgroup_reserved);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 0b24755596ba..036fa83d6ccb 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -102,6 +102,7 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
+	u64 chunk_bytes_reserved;
 	u64 qgroup_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
-- 
cgit v1.2.3-55-g7522


From 39c2d7faccc5ca5a1be682b01c0db5fafa8adeda Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Wed, 20 May 2015 14:01:55 +0100
Subject: Btrfs: fix -ENOSPC on block group removal

Unlike when attempting to allocate a new block group, where we check
that we have enough space in the system space_info to update the device
items and insert a new chunk item in the chunk tree, we were not checking
if the system space_info had enough space for updating the device items
and deleting the chunk item in the chunk tree. This often lead to -ENOSPC
error when attempting to allocate blocks for the chunk tree (during btree
node/leaf COW operations) while updating the device items or deleting the
chunk item, which resulted in the current transaction being aborted and
turning the filesystem into read-only mode.

While running fstests generic/038, which stresses allocation of block
groups and removal of unused block groups, with a large scratch device
(750Gb) this happened often, despite more than enough unallocated space,
and resulted in the following trace:

[68663.586604] WARNING: CPU: 3 PID: 1521 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]()
[68663.600407] BTRFS: Transaction aborted (error -28)
(...)
[68663.730829] Call Trace:
[68663.732585]  [<ffffffff8142fa46>] dump_stack+0x4f/0x7b
[68663.734334]  [<ffffffff8108b6a2>] ? console_unlock+0x361/0x3ad
[68663.739980]  [<ffffffff81045ea5>] warn_slowpath_common+0xa1/0xbb
[68663.757153]  [<ffffffffa036ca6d>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.760925]  [<ffffffff81045f05>] warn_slowpath_fmt+0x46/0x48
[68663.762854]  [<ffffffffa03b159d>] ? btrfs_update_device+0x15a/0x16c [btrfs]
[68663.764073]  [<ffffffffa036ca6d>] __btrfs_abort_transaction+0x52/0x114 [btrfs]
[68663.765130]  [<ffffffffa03b3638>] btrfs_remove_chunk+0x597/0x5ee [btrfs]
[68663.765998]  [<ffffffffa0384663>] ? btrfs_delete_unused_bgs+0x245/0x296 [btrfs]
[68663.767068]  [<ffffffffa0384676>] btrfs_delete_unused_bgs+0x258/0x296 [btrfs]
[68663.768227]  [<ffffffff8143527f>] ? _raw_spin_unlock_irq+0x2d/0x4c
[68663.769081]  [<ffffffffa038b109>] cleaner_kthread+0x13d/0x16c [btrfs]
[68663.799485]  [<ffffffffa038afcc>] ? btrfs_alloc_root+0x28/0x28 [btrfs]
[68663.809208]  [<ffffffff8105f367>] kthread+0xef/0xf7
[68663.828795]  [<ffffffff810e603f>] ? time_hardirqs_on+0x15/0x28
[68663.844942]  [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.846486]  [<ffffffff81435a88>] ret_from_fork+0x58/0x90
[68663.847760]  [<ffffffff8105f278>] ? __kthread_parkme+0xad/0xad
[68663.849503] ---[ end trace 798477c6d6dbaad6 ]---
[68663.850525] BTRFS: error (device sdc) in btrfs_remove_chunk:2652: errno=-28 No space left

So fix this by verifying that enough space exists in system space_info,
and reserving the space in the chunk block reserve, before attempting to
delete the block group and allocate a new system chunk if we don't have
enough space to perform the necessary updates and delete in the chunk
tree. Like for the block group creation case, we don't error our if we
fail to allocate a new system chunk, since we might end up not needing
it (no node/leaf splits happen during the COW operations and/or we end
up not needing to COW any btree nodes or leafs because they were already
COWed in the current transaction and their writeback didn't start yet).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  4 ++++
 fs/btrfs/extent-tree.c | 31 +++++++++++++++++++++++--------
 fs/btrfs/volumes.c     |  3 +++
 3 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8ee5645ef9e1..92e908394403 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3516,6 +3516,10 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 int __get_raid_index(u64 flags);
 int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void check_system_chunk(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			const u64 type,
+			const bool is_allocation);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e08e47ace30..e78ab29f8f1b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4092,7 +4092,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	return 1;
 }
 
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
 {
 	u64 num_dev;
 
@@ -4106,17 +4106,24 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 	else
 		num_dev = 1;	/* DUP or single */
 
-	/* metadata for updaing devices and chunk tree */
-	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+	return num_dev;
 }
 
-static void check_system_chunk(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			u64 type,
+			const bool is_allocation)
 {
 	struct btrfs_space_info *info;
 	u64 left;
 	u64 thresh;
 	int ret = 0;
+	u64 num_devs;
 
 	/*
 	 * Needed because we can end up allocating a system chunk and for an
@@ -4131,7 +4138,15 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
 		info->bytes_may_use;
 	spin_unlock(&info->lock);
 
-	thresh = get_system_chunk_thresh(root, type);
+	num_devs = get_profile_num_devs(root, type);
+
+	/* num_devs device items to update and 1 chunk item to add or remove */
+	if (is_allocation)
+		thresh = btrfs_calc_trans_metadata_size(root, num_devs + 1);
+	else
+		thresh = btrfs_calc_trans_metadata_size(root, num_devs) +
+			btrfs_calc_trunc_metadata_size(root, 1);
+
 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
 		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
 			left, thresh, type);
@@ -4243,7 +4258,7 @@ again:
 	 * Check if we have enough space in SYSTEM chunk because we may need
 	 * to update devices.
 	 */
-	check_system_chunk(trans, extent_root, flags);
+	check_system_chunk(trans, extent_root, flags, true);
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	trans->allocating_chunk = false;
@@ -8905,7 +8920,7 @@ out:
 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 		alloc_flags = update_block_group_flags(root, cache->flags);
 		lock_chunks(root->fs_info->chunk_root);
-		check_system_chunk(trans, root, alloc_flags);
+		check_system_chunk(trans, root, alloc_flags, true);
 		unlock_chunks(root->fs_info->chunk_root);
 	}
 	mutex_unlock(&root->fs_info->ro_block_group_mutex);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 534be440dd56..d7668756b9d0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2625,6 +2625,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		return -EINVAL;
 	}
 	map = (struct map_lookup *)em->bdev;
+	lock_chunks(root->fs_info->chunk_root);
+	check_system_chunk(trans, extent_root, map->type, false);
+	unlock_chunks(root->fs_info->chunk_root);
 
 	for (i = 0; i < map->num_stripes; i++) {
 		struct btrfs_device *device = map->stripes[i].dev;
-- 
cgit v1.2.3-55-g7522


From 0c304304feab8a576ed6ba6ec964255d00d2886e Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Mon, 25 May 2015 11:20:22 +0800
Subject: Btrfs: remove csum_bytes_left

After commit 8407f553268a
("Btrfs: fix data corruption after fast fsync and writeback error"),
during wait_ordered_extents(), we wait for ordered extent setting
BTRFS_ORDERED_IO_DONE or BTRFS_ORDERED_IOERR, at which point we've
already got checksum information, so we don't need to check
(csum_bytes_left == 0) in the whole logging path.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ordered-data.c | 7 -------
 fs/btrfs/ordered-data.h | 3 ---
 fs/btrfs/tree-log.c     | 6 ------
 3 files changed, 16 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 760c4a5e096b..47966cb3d4b6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
-	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
-	    !(type == BTRFS_ORDERED_NOCOW))
-		entry->csum_bytes_left = disk_len;
 	entry->disk_len = disk_len;
 	entry->bytes_left = len;
 	entry->inode = igrab(inode);
@@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode,
 	tree = &BTRFS_I(inode)->ordered_tree;
 	spin_lock_irq(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
-	WARN_ON(entry->csum_bytes_left < sum->len);
-	entry->csum_bytes_left -= sum->len;
-	if (entry->csum_bytes_left == 0)
-		wake_up(&entry->wait);
 	spin_unlock_irq(&tree->lock);
 }
 
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index e96cd4ccd805..a82cd7535d3c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -89,9 +89,6 @@ struct btrfs_ordered_extent {
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
-	/* number of bytes that still need csumming */
-	u64 csum_bytes_left;
-
 	/*
 	 * the end of the ordered extent which is behind it but
 	 * didn't update disk_i_size. Please see the comment of
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d04968374e9d..1ce80c1c4eb6 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
 				     &ordered->flags))
 			continue;
 
-		if (ordered->csum_bytes_left) {
-			btrfs_start_ordered_extent(inode, ordered, 0);
-			wait_event(ordered->wait,
-				   ordered->csum_bytes_left == 0);
-		}
-
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
 			if (ret)
-- 
cgit v1.2.3-55-g7522


From 8635eda91ee11690bd8f73b2504ee19431fd6380 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Mon, 25 May 2015 17:30:14 +0800
Subject: Btrfs: add missing free_extent_buffer

read_tree_block may take a reference on the 'eb', a following
free_extent_buffer is necessary.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e78ab29f8f1b..076fd7484a82 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7981,6 +7981,7 @@ walk_down:
 			eb = read_tree_block(root, child_bytenr, child_gen);
 			if (!eb || !extent_buffer_uptodate(eb)) {
 				ret = -EIO;
+				free_extent_buffer(eb);
 				goto out;
 			}
 
-- 
cgit v1.2.3-55-g7522


From 64c043de466d5746e7ca306dab9d418cd871cefc Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Mon, 25 May 2015 17:30:15 +0800
Subject: Btrfs: fix up read_tree_block to return proper error

The return value of read_tree_block() can confuse callers as it always
returns NULL for either -ENOMEM or -EIO, so it's likely that callers
parse it to a wrong error, for instance, in btrfs_read_tree_root().

This fixes the above issue.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c     |  9 +++++++--
 fs/btrfs/ctree.c       | 16 ++++++++++------
 fs/btrfs/disk-io.c     | 28 +++++++++++++++-------------
 fs/btrfs/extent-tree.c | 11 ++++++++---
 fs/btrfs/relocation.c  | 19 ++++++++++++++-----
 5 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 614aaa1969bd..679dc97354be 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -491,7 +491,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
 				     0);
-		if (!eb || !extent_buffer_uptodate(eb)) {
+		if (IS_ERR(eb)) {
+			return PTR_ERR(eb);
+		} else if (!extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
 		}
@@ -1034,7 +1036,10 @@ again:
 
 				eb = read_tree_block(fs_info->extent_root,
 							   ref->parent, 0);
-				if (!eb || !extent_buffer_uptodate(eb)) {
+				if (IS_ERR(eb)) {
+					ret = PTR_ERR(eb);
+					goto out;
+				} else if (!extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
 					goto out;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0f11ebc92f02..54114b4887dd 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
 		old = read_tree_block(root, logical, 0);
-		if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
-			free_extent_buffer(old);
+		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+			if (!IS_ERR(old))
+				free_extent_buffer(old);
 			btrfs_warn(root->fs_info,
 				"failed to read tree block %llu from get_old_root", logical);
 		} else {
@@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		if (!cur || !uptodate) {
 			if (!cur) {
 				cur = read_tree_block(root, blocknr, gen);
-				if (!cur || !extent_buffer_uptodate(cur)) {
+				if (IS_ERR(cur)) {
+					return PTR_ERR(cur);
+				} else if (!extent_buffer_uptodate(cur)) {
 					free_extent_buffer(cur);
 					return -EIO;
 				}
@@ -1864,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
 
 	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
 			     btrfs_node_ptr_generation(parent, slot));
-	if (eb && !extent_buffer_uptodate(eb)) {
-		free_extent_buffer(eb);
+	if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
+		if (!IS_ERR(eb))
+			free_extent_buffer(eb);
 		eb = NULL;
 	}
 
@@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 
 	ret = -EAGAIN;
 	tmp = read_tree_block(root, blocknr, 0);
-	if (tmp) {
+	if (!IS_ERR(tmp)) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
 		 * it will never end up being up to date.  Set ret to EIO now
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b72d06..7f8377871283 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	buf = btrfs_find_create_tree_block(root, bytenr);
 	if (!buf)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret) {
 		free_extent_buffer(buf);
-		return NULL;
+		return ERR_PTR(ret);
 	}
 	return buf;
 
@@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	generation = btrfs_root_generation(&root->root_item);
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     generation);
-	if (!root->node) {
-		ret = -ENOMEM;
+	if (IS_ERR(root->node)) {
+		ret = PTR_ERR(root->node);
 		goto find_fail;
 	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
 		ret = -EIO;
-		goto read_fail;
+		free_extent_buffer(root->node);
+		goto find_fail;
 	}
 	root->commit_root = btrfs_root_node(root);
 out:
 	btrfs_free_path(path);
 	return root;
 
-read_fail:
-	free_extent_buffer(root->node);
 find_fail:
 	kfree(root);
 alloc_fail:
@@ -2320,8 +2319,11 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 
 	log_tree_root->node = read_tree_block(tree_root, bytenr,
 			fs_info->generation + 1);
-	if (!log_tree_root->node ||
-	    !extent_buffer_uptodate(log_tree_root->node)) {
+	if (IS_ERR(log_tree_root->node)) {
+		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		kfree(log_tree_root);
+		return PTR_ERR(log_tree_root->node);
+	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read log tree\n");
 		free_extent_buffer(log_tree_root->node);
 		kfree(log_tree_root);
@@ -2797,8 +2799,8 @@ int open_ctree(struct super_block *sb,
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
 					   generation);
-	if (!chunk_root->node ||
-	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+	if (IS_ERR(chunk_root->node) ||
+	    !extent_buffer_uptodate(chunk_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
@@ -2834,8 +2836,8 @@ retry_root_backup:
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  generation);
-	if (!tree_root->node ||
-	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+	if (IS_ERR(tree_root->node) ||
+	    !extent_buffer_uptodate(tree_root->node)) {
 		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
 		       sb->s_id);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 076fd7484a82..f1d1216f7feb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7979,9 +7979,12 @@ walk_down:
 			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
 
 			eb = read_tree_block(root, child_bytenr, child_gen);
-			if (!eb || !extent_buffer_uptodate(eb)) {
-				ret = -EIO;
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+				goto out;
+			} else if (!extent_buffer_uptodate(eb)) {
 				free_extent_buffer(eb);
+				ret = -EIO;
 				goto out;
 			}
 
@@ -8211,7 +8214,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
 		next = read_tree_block(root, bytenr, generation);
-		if (!next || !extent_buffer_uptodate(next)) {
+		if (IS_ERR(next)) {
+			return PTR_ERR(next);
+		} else if (!extent_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			return -EIO;
 		}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74b24b01d574..827951fbf7fc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1847,8 +1847,10 @@ again:
 			}
 
 			eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
-			if (!eb || !extent_buffer_uptodate(eb)) {
-				ret = (!eb) ? -ENOMEM : -EIO;
+			if (IS_ERR(eb)) {
+				ret = PTR_ERR(eb);
+			} else if (!extent_buffer_uptodate(eb)) {
+				ret = -EIO;
 				free_extent_buffer(eb);
 				break;
 			}
@@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
 
 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
 		eb = read_tree_block(root, bytenr, ptr_gen);
-		if (!eb || !extent_buffer_uptodate(eb)) {
+		if (IS_ERR(eb)) {
+			return PTR_ERR(eb);
+		} else if (!extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
 		}
@@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
 		blocksize = root->nodesize;
 		generation = btrfs_node_ptr_generation(upper->eb, slot);
 		eb = read_tree_block(root, bytenr, generation);
-		if (!eb || !extent_buffer_uptodate(eb)) {
+		if (IS_ERR(eb)) {
+			err = PTR_ERR(eb);
+			goto next;
+		} else if (!extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			err = -EIO;
 			goto next;
@@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc,
 	BUG_ON(block->key_ready);
 	eb = read_tree_block(rc->extent_root, block->bytenr,
 			     block->key.offset);
-	if (!eb || !extent_buffer_uptodate(eb)) {
+	if (IS_ERR(eb)) {
+		return PTR_ERR(eb);
+	} else if (!extent_buffer_uptodate(eb)) {
 		free_extent_buffer(eb);
 		return -EIO;
 	}
-- 
cgit v1.2.3-55-g7522


From 6ca0709756710c47ec604dd08b9fc45929d36390 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 26 May 2015 00:55:42 +0100
Subject: Btrfs: fix hang during inode eviction due to concurrent readahead

Zygo Blaxell and other users have reported occasional hangs while an
inode is being evicted, leading to traces like the following:

[ 5281.972322] INFO: task rm:20488 blocked for more than 120 seconds.
[ 5281.973836]       Not tainted 4.0.0-rc5-btrfs-next-9+ #2
[ 5281.974818] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 5281.976364] rm              D ffff8800724cfc38     0 20488   7747 0x00000000
[ 5281.977506]  ffff8800724cfc38 ffff8800724cfc38 ffff880065da5c50 0000000000000001
[ 5281.978461]  ffff8800724cffd8 ffff8801540a5f50 0000000000000008 ffff8801540a5f78
[ 5281.979541]  ffff8801540a5f50 ffff8800724cfc58 ffffffff8143107e 0000000000000123
[ 5281.981396] Call Trace:
[ 5281.982066]  [<ffffffff8143107e>] schedule+0x74/0x83
[ 5281.983341]  [<ffffffffa03b33cf>] wait_on_state+0xac/0xcd [btrfs]
[ 5281.985127]  [<ffffffff81075cd6>] ? signal_pending_state+0x31/0x31
[ 5281.986715]  [<ffffffffa03b4b71>] wait_extent_bit.constprop.32+0x7c/0xde [btrfs]
[ 5281.988680]  [<ffffffffa03b540b>] lock_extent_bits+0x5d/0x88 [btrfs]
[ 5281.990200]  [<ffffffffa03a621d>] btrfs_evict_inode+0x24e/0x5be [btrfs]
[ 5281.991781]  [<ffffffff8116964d>] evict+0xa0/0x148
[ 5281.992735]  [<ffffffff8116a43d>] iput+0x18f/0x1e5
[ 5281.993796]  [<ffffffff81160d4a>] do_unlinkat+0x15b/0x1fa
[ 5281.994806]  [<ffffffff81435b54>] ? ret_from_sys_call+0x1d/0x58
[ 5281.996120]  [<ffffffff8107d314>] ? trace_hardirqs_on_caller+0x18f/0x1ab
[ 5281.997562]  [<ffffffff8123960b>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[ 5281.998815]  [<ffffffff81161a16>] SyS_unlinkat+0x29/0x2b
[ 5281.999920]  [<ffffffff81435b32>] system_call_fastpath+0x12/0x17
[ 5282.001299] 1 lock held by rm/20488:
[ 5282.002066]  #0:  (sb_writers#12){.+.+.+}, at: [<ffffffff8116dd81>] mnt_want_write+0x24/0x4b

This happens when we have readahead, which calls readpages(), happening
right before the inode eviction handler is invoked. So the reason is
essentially:

1) readpages() is called while a reference on the inode is held, so
   eviction can not be triggered before readpages() returns. It also
   locks one or more ranges in the inode's io_tree (which is done at
   extent_io.c:__do_contiguous_readpages());

2) readpages() submits several read bios, all with an end io callback
   that runs extent_io.c:end_bio_extent_readpage() and that is executed
   by other task when a bio finishes, corresponding to a work queue
   (fs_info->end_io_workers) worker kthread. This callback unlocks
   the ranges in the inode's io_tree that were previously locked in
   step 1;

3) readpages() returns, the reference on the inode is dropped;

4) One or more of the read bios previously submitted are still not
   complete (their end io callback was not yet invoked or has not
   yet finished execution);

5) Inode eviction is triggered (through an unlink call for example).
   The inode reference count was not incremented before submitting
   the read bios, therefore this is possible;

6) The eviction handler starts executing and enters the loop that
   iterates over all extent states in the inode's io_tree;

7) The loop picks one extent state record and uses its ->start and
   ->end fields, after releasing the inode's io_tree spinlock, to
   call lock_extent_bits() and clear_extent_bit(). The call to lock
   the range [state->start, state->end] blocks because the whole
   range or a part of it was locked by the previous call to
   readpages() and the corresponding end io callback, which unlocks
   the range was not yet executed;

8) The end io callback for the read bio is executed and unlocks the
   range [state->start, state->end] (or a superset of that range).
   And at clear_extent_bit() the extent_state record state is used
   as a second argument to split_state(), which sets state->start to
   a larger value;

9) The task executing the eviction handler is woken up by the task
   executing the bio's end io callback (through clear_state_bit) and
   the eviction handler locks the range
   [old value for state->start, state->end]. Shortly after, when
   calling clear_extent_bit(), it unlocks the range
   [new value for state->start, state->end], so it ends up unlocking
   only part of the range that it locked, leaving an extent state
   record in the io_tree that represents the unlocked subrange;

10) The eviction handler loop, in its next iteration, gets the
    extent_state record for the subrange that it did not unlock in the
    previous step and then tries to lock it, resulting in an hang.

So fix this by not using the ->start and ->end fields of an existing
extent_state record. This is a simple solution, and an alternative
could be to bump the inode's reference count before submitting each
read bio and having it dropped in the bio's end io callback. But that
would be a more invasive/complex change and would not protect against
other possible places that are not holding a reference on the inode
as well. Something to consider in the future.

Many thanks to Zygo Blaxell for reporting, in the mailing list, the
issue, a set of scripts to trigger it and testing this fix.

Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bb013672aee..855935f6671a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4986,24 +4986,40 @@ static void evict_inode_truncate_pages(struct inode *inode)
 	}
 	write_unlock(&map_tree->lock);
 
+	/*
+	 * Keep looping until we have no more ranges in the io tree.
+	 * We can have ongoing bios started by readpages (called from readahead)
+	 * that didn't get their end io callbacks called yet or they are still
+	 * in progress ((extent_io.c:end_bio_extent_readpage()). This means some
+	 * ranges can still be locked and eviction started because before
+	 * submitting those bios, which are executed by a separate task (work
+	 * queue kthread), inode references (inode->i_count) were not taken
+	 * (which would be dropped in the end io callback of each bio).
+	 * Therefore here we effectively end up waiting for those bios and
+	 * anyone else holding locked ranges without having bumped the inode's
+	 * reference count - if we don't do it, when they access the inode's
+	 * io_tree to unlock a range it may be too late, leading to an
+	 * use-after-free issue.
+	 */
 	spin_lock(&io_tree->lock);
 	while (!RB_EMPTY_ROOT(&io_tree->state)) {
 		struct extent_state *state;
 		struct extent_state *cached_state = NULL;
+		u64 start;
+		u64 end;
 
 		node = rb_first(&io_tree->state);
 		state = rb_entry(node, struct extent_state, rb_node);
-		atomic_inc(&state->refs);
+		start = state->start;
+		end = state->end;
 		spin_unlock(&io_tree->lock);
 
-		lock_extent_bits(io_tree, state->start, state->end,
-				 0, &cached_state);
-		clear_extent_bit(io_tree, state->start, state->end,
+		lock_extent_bits(io_tree, start, end, 0, &cached_state);
+		clear_extent_bit(io_tree, start, end,
 				 EXTENT_LOCKED | EXTENT_DIRTY |
 				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
 				 EXTENT_DEFRAG, 1, 1,
 				 &cached_state, GFP_NOFS);
-		free_extent_state(state);
 
 		cond_resched();
 		spin_lock(&io_tree->lock);
-- 
cgit v1.2.3-55-g7522


From b659ef027792219b590d67a2baf1643a93727d29 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 31 Mar 2015 14:16:52 +0100
Subject: Btrfs: avoid syncing log in the fast fsync path when not necessary

Commit 3a8b36f37806 ("Btrfs: fix data loss in the fast fsync path") added
a performance regression for that causes an unnecessary sync of the log
trees (fs/subvol and root log trees) when 2 consecutive fsyncs are done
against a file, without no writes or any metadata updates to the inode in
between them and if a transaction is committed before the second fsync is
called.

Huang Ying reported this to lkml (https://lkml.org/lkml/2015/3/18/99)
after a test sysbench test that measured a -62% decrease of file io
requests per second for that tests' workload.

The test is:

  echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
  echo performance > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor
  echo performance > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
  echo performance > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
  mkfs -t btrfs /dev/sda2
  mount -t btrfs /dev/sda2 /fs/sda2
  cd /fs/sda2
  for ((i = 0; i < 1024; i++)); do fallocate -l 67108864 testfile.$i; done
  sysbench --test=fileio --max-requests=0 --num-threads=4 --max-time=600 \
    --file-test-mode=rndwr --file-total-size=68719476736 --file-io-mode=sync \
    --file-num=1024 run

A test on kvm guest, running a debug kernel gave me the following results:

Without 3a8b36f378060d:             16.01 reqs/sec
With 3a8b36f378060d:                 3.39 reqs/sec
With 3a8b36f378060d and this patch: 16.04 reqs/sec

Reported-by: Huang Ying <ying.huang@intel.com>
Tested-by: Huang, Ying <ying.huang@intel.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c         |  9 ++++++---
 fs/btrfs/ordered-data.c | 14 ++++++++++++++
 fs/btrfs/ordered-data.h |  3 +++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e17479aa..795d754327a7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	struct btrfs_log_ctx ctx;
 	int ret = 0;
 	bool full_sync = 0;
+	const u64 len = end - start + 1;
 
 	trace_btrfs_sync_file(file, datasync);
 
@@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		 * all extents are persisted and the respective file extent
 		 * items are in the fs/subvol btree.
 		 */
-		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+		ret = btrfs_wait_ordered_range(inode, start, len);
 	} else {
 		/*
 		 * Start any new ordered operations before starting to log the
@@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	smp_mb();
 	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
-	    (full_sync && BTRFS_I(inode)->last_trans <=
-	     root->fs_info->last_trans_committed)) {
+	    (BTRFS_I(inode)->last_trans <=
+	     root->fs_info->last_trans_committed &&
+	     (full_sync ||
+	      !btrfs_have_ordered_extents_in_range(inode, start, len)))) {
 		/*
 		 * We'v had everything committed since the last time we were
 		 * modified so clear this flag in case it was set for whatever
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 47966cb3d4b6..ceccd078c93d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -837,6 +837,20 @@ out:
 	return entry;
 }
 
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+					 u64 file_offset,
+					 u64 len)
+{
+	struct btrfs_ordered_extent *oe;
+
+	oe = btrfs_lookup_ordered_range(inode, file_offset, len);
+	if (oe) {
+		btrfs_put_ordered_extent(oe);
+		return true;
+	}
+	return false;
+}
+
 /*
  * lookup and return any extent before 'file_offset'.  NULL is returned
  * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index a82cd7535d3c..7176cc0fe43f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -188,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 							u64 file_offset,
 							u64 len);
+bool btrfs_have_ordered_extents_in_range(struct inode *inode,
+					 u64 file_offset,
+					 u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
-- 
cgit v1.2.3-55-g7522


From 7558c8bc17481c1f856e009af8503ab40fec348a Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 17 Apr 2015 17:08:37 +0100
Subject: Btrfs: don't attach unnecessary extents to transaction on fsync

We don't need to attach ordered extents that have completed to the current
transaction. Doing so only makes us hold memory for longer than necessary
and delaying the iput of the inode until the transaction is committed (for
each created ordered extent we do an igrab and then schedule an asynchronous
iput when the ordered extent's reference count drops to 0), preventing the
inode from being evictable until the transaction commits.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ordered-data.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ceccd078c93d..89656d799ff6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -502,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
 
-		list_add_tail(&ordered->trans_list, &trans->ordered);
+		/*
+		 * If our ordered extent completed it means it updated the
+		 * fs/subvol and csum trees already, so no need to make the
+		 * current transaction's commit wait for it, as we end up
+		 * holding memory unnecessarily and delaying the inode's iput
+		 * until the transaction commit (we schedule an iput for the
+		 * inode when the ordered extent's refcount drops to 0), which
+		 * prevents it from being evictable until the transaction
+		 * commits.
+		 */
+		if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
+			btrfs_put_ordered_extent(ordered);
+		else
+			list_add_tail(&ordered->trans_list, &trans->ordered);
+
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
-- 
cgit v1.2.3-55-g7522


From 4617ea3a52cfe8ae407ef406ab999f40a558c369 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 9 Jun 2015 17:48:21 +0100
Subject: Btrfs: fix necessary chunk tree space calculation when allocating a
 chunk

When allocating a new chunk or removing one we need to update num_devs
device items and insert or remove a chunk item in the chunk tree, so
in the worst case the space needed in the chunk space_info is:

  btrfs_calc_trunc_metadata_size(chunk_root, num_devs) +
     btrfs_calc_trans_metadata_size(chunk_root, 1)

That is, in the worst case we need to cow num_devs paths and cow 1 other
path that can result in splitting every node and leaf, and each path
consisting of BTRFS_MAX_LEVEL - 1 nodes and 1 leaf. We were requiring
some additional chunk_root->nodesize * BTRFS_MAX_LEVEL * num_devs bytes,
which were unnecessary since updating the existing device items does
not result in splitting the nodes and leaf since after updating them
they remain with the same size.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  3 +--
 fs/btrfs/extent-tree.c | 14 +++++---------
 fs/btrfs/volumes.c     |  2 +-
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 92e908394403..5e09834ac2ef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3518,8 +3518,7 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
 void check_system_chunk(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
-			const u64 type,
-			const bool is_allocation);
+			const u64 type);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f1d1216f7feb..4eefabcc838f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4116,8 +4116,7 @@ static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
  */
 void check_system_chunk(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
-			u64 type,
-			const bool is_allocation)
+			u64 type)
 {
 	struct btrfs_space_info *info;
 	u64 left;
@@ -4141,11 +4140,8 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
 	num_devs = get_profile_num_devs(root, type);
 
 	/* num_devs device items to update and 1 chunk item to add or remove */
-	if (is_allocation)
-		thresh = btrfs_calc_trans_metadata_size(root, num_devs + 1);
-	else
-		thresh = btrfs_calc_trans_metadata_size(root, num_devs) +
-			btrfs_calc_trunc_metadata_size(root, 1);
+	thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+		btrfs_calc_trans_metadata_size(root, 1);
 
 	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
 		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
@@ -4258,7 +4254,7 @@ again:
 	 * Check if we have enough space in SYSTEM chunk because we may need
 	 * to update devices.
 	 */
-	check_system_chunk(trans, extent_root, flags, true);
+	check_system_chunk(trans, extent_root, flags);
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	trans->allocating_chunk = false;
@@ -8926,7 +8922,7 @@ out:
 	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
 		alloc_flags = update_block_group_flags(root, cache->flags);
 		lock_chunks(root->fs_info->chunk_root);
-		check_system_chunk(trans, root, alloc_flags, true);
+		check_system_chunk(trans, root, alloc_flags);
 		unlock_chunks(root->fs_info->chunk_root);
 	}
 	mutex_unlock(&root->fs_info->ro_block_group_mutex);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d7668756b9d0..c208d6a4dddb 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2626,7 +2626,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	}
 	map = (struct map_lookup *)em->bdev;
 	lock_chunks(root->fs_info->chunk_root);
-	check_system_chunk(trans, extent_root, map->type, false);
+	check_system_chunk(trans, extent_root, map->type);
 	unlock_chunks(root->fs_info->chunk_root);
 
 	for (i = 0; i < map->num_stripes; i++) {
-- 
cgit v1.2.3-55-g7522


From e4826a5b2430f23ad4ec7823efcd413fce9f2d64 Mon Sep 17 00:00:00 2001
From: chandan
Date: Tue, 9 Jun 2015 17:38:32 +0530
Subject: Btrfs: btrfs_defrag_file: Fix ra_index computation.

Read-ahead is done for the pages in the range [ra_index, ra_index + cluster -
1]. So the next read-ahead should be starting from the page at index 'ra_index
+ cluster' (unless we deemed that the extent at 'ra_index + cluster' as
non-defraggable) rather than from the page at index 'ra_index +
max_cluster'. This patch fixes this. I did run the xfstests suite to make sure
that the code does not regress.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f7c65ca056f8..25c422b11bdb 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1368,7 +1368,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			ra_index = max(i, ra_index);
 			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
 				       cluster);
-			ra_index += max_cluster;
+			ra_index += cluster;
 		}
 
 		mutex_lock(&inode->i_mutex);
-- 
cgit v1.2.3-55-g7522


From 070034bdf98544b23a7fcf500618fd31dec06ab2 Mon Sep 17 00:00:00 2001
From: chandan
Date: Tue, 9 Jun 2015 10:35:11 +0530
Subject: Btrfs: btrfs_defrag_file: Fix calculation of max_to_defrag.

max_to_defrag represents the number of pages to defrag rather than the last
page of the file range to be defragged.

Consider a file having 10 4k blocks (i.e. blocks in the range [0 - 9]). If the
defrag ioctl was invoked for the block range [3 - 6], then max_to_defrag
should actually have the value 4. Instead in the current code we end up
setting it to 6.

Now, this does not (yet) cause an issue since the first part of the while loop
condition in btrfs_defrag_file() (i.e. "i <= last_index") causes the control
to flow out of the while loop before any buggy behavior is actually caused. So
the patch just makes sure that max_to_defrag ends up having the right value
rather than fixing a bug. I did run the xfstests suite to make sure that the
code does not regress.

Changelog: v1->v2:
Provide a much descriptive commit message.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 25c422b11bdb..9041f154cc32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1318,7 +1318,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index + 1;
+		max_to_defrag = last_index - i + 1;
 
 	/*
 	 * make writeback starts from i, so the defrag range can be
-- 
cgit v1.2.3-55-g7522


From e1d227a42ea2b4664f94212bd1106b9a3413ffb8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh
Date: Mon, 8 Jun 2015 15:05:25 -0700
Subject: btrfs: Handle unaligned length in extent_same

The extent-same code rejects requests with an unaligned length. This
poses a problem when we want to dedupe the tail extent of files as we
skip cloning the portion between i_size and the extent boundary.

If we don't clone the entire extent, it won't be deleted. So the
combination of these behaviors winds up giving us worst-case dedupe on
many files.

We can fix this by allowing a length that extents to i_size and
internally aligining those to the end of the block. This is what
btrfs_ioctl_clone() so we can just copy that check over.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9041f154cc32..c86b835da7a8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2889,12 +2889,19 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 	return ret;
 }
 
-static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen,
+				     u64 olen)
 {
+	u64 len = *plen;
 	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
 
-	if (off + len > inode->i_size || off + len < off)
+	if (off + olen > inode->i_size || off + olen < off)
 		return -EINVAL;
+
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == inode->i_size)
+		*plen = len = ALIGN(inode->i_size, bs) - off;
+
 	/* Check that we are block aligned - btrfs_clone() requires this */
 	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
 		return -EINVAL;
@@ -2902,10 +2909,11 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
 	return 0;
 }
 
-static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 			     struct inode *dst, u64 dst_loff)
 {
 	int ret;
+	u64 len = olen;
 
 	/*
 	 * btrfs_clone() can't handle extents in the same file
@@ -2920,11 +2928,11 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
 
 	btrfs_double_lock(src, loff, dst, dst_loff, len);
 
-	ret = extent_same_check_offsets(src, loff, len);
+	ret = extent_same_check_offsets(src, loff, &len, olen);
 	if (ret)
 		goto out_unlock;
 
-	ret = extent_same_check_offsets(dst, dst_loff, len);
+	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
 	if (ret)
 		goto out_unlock;
 
@@ -2937,7 +2945,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
 
 	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
 	if (ret == 0)
-		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
 
 out_unlock:
 	btrfs_double_unlock(src, loff, dst, dst_loff, len);
-- 
cgit v1.2.3-55-g7522


From 20b2e3029eef277cd93a46a991004260057e1a9e Mon Sep 17 00:00:00 2001
From: Zhao Lei
Date: Thu, 4 Jun 2015 20:09:15 +0800
Subject: btrfs: Fix lockdep warning of wr_ctx->wr_lock in scrub_free_wr_ctx()

lockdep report following warning in test:
 [25176.843958] =================================
 [25176.844519] [ INFO: inconsistent lock state ]
 [25176.845047] 4.1.0-rc3 #22 Tainted: G        W
 [25176.845591] ---------------------------------
 [25176.846153] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
 [25176.846713] fsstress/26661 [HC0[0]:SC1[1]:HE1:SE0] takes:
 [25176.847246]  (&wr_ctx->wr_lock){+.?...}, at: [<ffffffffa04cdc6d>] scrub_free_ctx+0x2d/0xf0 [btrfs]
 [25176.847838] {SOFTIRQ-ON-W} state was registered at:
 [25176.848396]   [<ffffffff810bf460>] __lock_acquire+0x6a0/0xe10
 [25176.848955]   [<ffffffff810bfd1e>] lock_acquire+0xce/0x2c0
 [25176.849491]   [<ffffffff816489af>] mutex_lock_nested+0x7f/0x410
 [25176.850029]   [<ffffffffa04d04ff>] scrub_stripe+0x4df/0x1080 [btrfs]
 [25176.850575]   [<ffffffffa04d11b1>] scrub_chunk.isra.19+0x111/0x130 [btrfs]
 [25176.851110]   [<ffffffffa04d144c>] scrub_enumerate_chunks+0x27c/0x510 [btrfs]
 [25176.851660]   [<ffffffffa04d3b87>] btrfs_scrub_dev+0x1c7/0x6c0 [btrfs]
 [25176.852189]   [<ffffffffa04e918e>] btrfs_dev_replace_start+0x36e/0x450 [btrfs]
 [25176.852771]   [<ffffffffa04a98e0>] btrfs_ioctl+0x1e10/0x2d20 [btrfs]
 [25176.853315]   [<ffffffff8121c5b8>] do_vfs_ioctl+0x318/0x570
 [25176.853868]   [<ffffffff8121c851>] SyS_ioctl+0x41/0x80
 [25176.854406]   [<ffffffff8164da17>] system_call_fastpath+0x12/0x6f
 [25176.854935] irq event stamp: 51506
 [25176.855511] hardirqs last  enabled at (51506): [<ffffffff810d4ce5>] vprintk_emit+0x225/0x5e0
 [25176.856059] hardirqs last disabled at (51505): [<ffffffff810d4b77>] vprintk_emit+0xb7/0x5e0
 [25176.856642] softirqs last  enabled at (50886): [<ffffffff81067a23>] __do_softirq+0x363/0x640
 [25176.857184] softirqs last disabled at (50949): [<ffffffff8106804d>] irq_exit+0x10d/0x120
 [25176.857746]
 other info that might help us debug this:
 [25176.858845]  Possible unsafe locking scenario:
 [25176.859981]        CPU0
 [25176.860537]        ----
 [25176.861059]   lock(&wr_ctx->wr_lock);
 [25176.861705]   <Interrupt>
 [25176.862272]     lock(&wr_ctx->wr_lock);
 [25176.862881]
  *** DEADLOCK ***

Reason:
 Above warning is caused by:
 Interrupt
 -> bio_endio()
 -> ...
 -> scrub_put_ctx()
 -> scrub_free_ctx() *1
 -> ...
 -> mutex_lock(&wr_ctx->wr_lock);

 scrub_put_ctx() is allowed to be called in end_bio interrupt, but
 in code design, it will never call scrub_free_ctx(sctx) in interrupe
 context(above *1), because btrfs_scrub_dev() get one additional
 reference of sctx->refs, which makes scrub_free_ctx() only called
 withine btrfs_scrub_dev().

 Now the code runs out of our wish, because free sequence in
 scrub_pending_bio_dec() have a gap.

 Current code:
 -----------------------------------+-----------------------------------
 scrub_pending_bio_dec()            |  btrfs_scrub_dev
 -----------------------------------+-----------------------------------
 atomic_dec(&sctx->bios_in_flight); |
 wake_up(&sctx->list_wait);         |
                                    | scrub_put_ctx()
                                    | -> atomic_dec_and_test(&sctx->refs)
 scrub_put_ctx(sctx);               |
 -> atomic_dec_and_test(&sctx->refs)|
 -> scrub_free_ctx()                |
 -----------------------------------+-----------------------------------

 We expected:
 -----------------------------------+-----------------------------------
 scrub_pending_bio_dec()            |  btrfs_scrub_dev
 -----------------------------------+-----------------------------------
 atomic_dec(&sctx->bios_in_flight); |
 wake_up(&sctx->list_wait);         |
 scrub_put_ctx(sctx);               |
 -> atomic_dec_and_test(&sctx->refs)|
                                    | scrub_put_ctx()
                                    | -> atomic_dec_and_test(&sctx->refs)
                                    | -> scrub_free_ctx()
 -----------------------------------+-----------------------------------

Fix:
 Move scrub_pending_bio_dec() to a workqueue, to avoid this function run
 in interrupt context.
 Tested by check tracelog in debug.

Changelog v1->v2:
 Use workqueue instead of adjust function call sequence in v1,
 because v1 will introduce a bug pointed out by:
 Filipe David Manana <fdmanana@gmail.com>

Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/async-thread.c |  1 +
 fs/btrfs/async-thread.h |  2 ++
 fs/btrfs/ctree.h        |  1 +
 fs/btrfs/scrub.c        | 26 +++++++++++++++++++++++---
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index df9932b00d08..1ce06c849a86 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper);
 BTRFS_WORK_HELPER(scrub_helper);
 BTRFS_WORK_HELPER(scrubwrc_helper);
 BTRFS_WORK_HELPER(scrubnc_helper);
+BTRFS_WORK_HELPER(scrubparity_helper);
 
 static struct __btrfs_workqueue *
 __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ec2ee477f8ba..b0b093b6afec 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
 BTRFS_WORK_HELPER_PROTO(scrub_helper);
 BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
+
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 					      unsigned int flags,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e09834ac2ef..881549a35fca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1698,6 +1698,7 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *scrub_workers;
 	struct btrfs_workqueue *scrub_wr_completion_workers;
 	struct btrfs_workqueue *scrub_nocow_workers;
+	struct btrfs_workqueue *scrub_parity_workers;
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	u32 check_integrity_print_mask;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ab5811545a98..9f2feabe99f2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity)
 	kfree(sparity);
 }
 
+static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
+{
+	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
+						    work);
+	struct scrub_ctx *sctx = sparity->sctx;
+
+	scrub_free_parity(sparity);
+	scrub_pending_bio_dec(sctx);
+}
+
 static void scrub_parity_bio_endio(struct bio *bio, int error)
 {
 	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
-	struct scrub_ctx *sctx = sparity->sctx;
 
 	if (error)
 		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
 			  sparity->nsectors);
 
-	scrub_free_parity(sparity);
-	scrub_pending_bio_dec(sctx);
 	bio_put(bio);
+
+	btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
+			scrub_parity_bio_endio_worker, NULL, NULL);
+	btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers,
+			 &sparity->work);
 }
 
 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
@@ -3589,6 +3601,13 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 			ret = -ENOMEM;
 			goto out;
 		}
+		fs_info->scrub_parity_workers =
+			btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+					      max_active, 2);
+		if (!fs_info->scrub_parity_workers) {
+			ret = -ENOMEM;
+			goto out;
+		}
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
@@ -3601,6 +3620,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 		btrfs_destroy_workqueue(fs_info->scrub_workers);
 		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
 		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
+		btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
 	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
-- 
cgit v1.2.3-55-g7522


From 00db646d3fb3f5f62c2327abcf3630f4cc1075ba Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 1 Apr 2015 14:39:06 +0800
Subject: btrfs: backref: Don't merge refs which are not for same block.

Old __merge_refs() in backref.c will even merge refs whose root_id are
different, which makes qgroup gives wrong result.

Fix it by checking ref_for_same_block() before any mode specific works.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 679dc97354be..a3316f1707e6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -509,7 +509,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * merge two lists of backrefs and adjust counts accordingly
+ * merge backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
  *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
@@ -537,9 +537,9 @@ static void __merge_refs(struct list_head *head, int mode)
 
 			ref2 = list_entry(pos2, struct __prelim_ref, list);
 
+			if (!ref_for_same_block(ref1, ref2))
+				continue;
 			if (mode == 1) {
-				if (!ref_for_same_block(ref1, ref2))
-					continue;
 				if (!ref1->parent && ref2->parent) {
 					xchg = ref1;
 					ref1 = ref2;
-- 
cgit v1.2.3-55-g7522


From c6fc24549960f26910cd0c6e4b5f48f2f306b11d Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 30 Mar 2015 17:03:00 +0800
Subject: btrfs: delayed-ref: Use list to replace the ref_root in ref_head.

This patch replace the rbtree used in ref_head to list.
This has the following advantage:
1) Easier merge logic.
With the new list implement, we only need to care merging the tail
ref_node with the new ref_node.
And this can be done quite easy at insert time, no need to do a
indicated merge at run_delayed_refs().

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c     |   9 +--
 fs/btrfs/delayed-ref.c | 156 ++++++++++++++++++++++++++-----------------------
 fs/btrfs/delayed-ref.h |  18 +++++-
 fs/btrfs/disk-io.c     |   8 +--
 fs/btrfs/extent-tree.c |  46 +++------------
 5 files changed, 114 insertions(+), 123 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a3316f1707e6..49bc5a41c1f8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -574,8 +574,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			      struct list_head *prefs, u64 *total_refs,
 			      u64 inum)
 {
+	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
-	struct rb_node *n = &head->node.rb_node;
 	struct btrfs_key key;
 	struct btrfs_key op_key = {0};
 	int sgn;
@@ -585,12 +585,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
 	spin_lock(&head->lock);
-	n = rb_first(&head->ref_root);
-	while (n) {
-		struct btrfs_delayed_ref_node *node;
-		node = rb_entry(n, struct btrfs_delayed_ref_node,
-				rb_node);
-		n = rb_next(n);
+	list_for_each_entry(node, &head->ref_list, list) {
 		if (node->seq > seq)
 			continue;
 
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 8f8ed7d20bac..4dbc31636d14 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -268,7 +268,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 		rb_erase(&head->href_node, &delayed_refs->href_root);
 	} else {
 		assert_spin_locked(&head->lock);
-		rb_erase(&ref->rb_node, &head->ref_root);
+		list_del(&ref->list);
 	}
 	ref->in_tree = 0;
 	btrfs_put_delayed_ref(ref);
@@ -328,48 +328,6 @@ static int merge_ref(struct btrfs_trans_handle *trans,
 	return done;
 }
 
-void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
-			      struct btrfs_fs_info *fs_info,
-			      struct btrfs_delayed_ref_root *delayed_refs,
-			      struct btrfs_delayed_ref_head *head)
-{
-	struct rb_node *node;
-	u64 seq = 0;
-
-	assert_spin_locked(&head->lock);
-	/*
-	 * We don't have too much refs to merge in the case of delayed data
-	 * refs.
-	 */
-	if (head->is_data)
-		return;
-
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	if (!list_empty(&fs_info->tree_mod_seq_list)) {
-		struct seq_list *elem;
-
-		elem = list_first_entry(&fs_info->tree_mod_seq_list,
-					struct seq_list, list);
-		seq = elem->seq;
-	}
-	spin_unlock(&fs_info->tree_mod_seq_lock);
-
-	node = rb_first(&head->ref_root);
-	while (node) {
-		struct btrfs_delayed_ref_node *ref;
-
-		ref = rb_entry(node, struct btrfs_delayed_ref_node,
-			       rb_node);
-		/* We can't merge refs that are outside of our seq count */
-		if (seq && ref->seq >= seq)
-			break;
-		if (merge_ref(trans, delayed_refs, head, ref, seq))
-			node = rb_first(&head->ref_root);
-		else
-			node = rb_next(&ref->rb_node);
-	}
-}
-
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
@@ -484,6 +442,74 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 	}
 }
 
+/*
+ * Helper to insert the ref_node to the tail or merge with tail.
+ *
+ * Return 0 for insert.
+ * Return >0 for merge.
+ */
+static int
+add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
+			   struct btrfs_delayed_ref_root *root,
+			   struct btrfs_delayed_ref_head *href,
+			   struct btrfs_delayed_ref_node *ref)
+{
+	struct btrfs_delayed_ref_node *exist;
+	int mod;
+	int ret = 0;
+
+	spin_lock(&href->lock);
+	/* Check whether we can merge the tail node with ref */
+	if (list_empty(&href->ref_list))
+		goto add_tail;
+	exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
+			   list);
+	/* No need to compare bytenr nor is_head */
+	if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
+	    exist->seq != ref->seq)
+		goto add_tail;
+
+	if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	     exist->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+	    comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist),
+			   btrfs_delayed_node_to_tree_ref(ref),
+			   ref->type))
+		goto add_tail;
+	if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY ||
+	     exist->type == BTRFS_SHARED_DATA_REF_KEY) &&
+	    comp_data_refs(btrfs_delayed_node_to_data_ref(exist),
+			   btrfs_delayed_node_to_data_ref(ref)))
+		goto add_tail;
+
+	/* Now we are sure we can merge */
+	ret = 1;
+	if (exist->action == ref->action) {
+		mod = ref->ref_mod;
+	} else {
+		/* Need to change action */
+		if (exist->ref_mod < ref->ref_mod) {
+			exist->action = ref->action;
+			mod = -exist->ref_mod;
+			exist->ref_mod = ref->ref_mod;
+		} else
+			mod = -ref->ref_mod;
+	}
+	exist->ref_mod += mod;
+
+	/* remove existing tail if its ref_mod is zero */
+	if (exist->ref_mod == 0)
+		drop_delayed_ref(trans, root, href, exist);
+	spin_unlock(&href->lock);
+	return ret;
+
+add_tail:
+	list_add_tail(&ref->list, &href->ref_list);
+	atomic_inc(&root->num_entries);
+	trans->delayed_ref_updates++;
+	spin_unlock(&href->lock);
+	return ret;
+}
+
 /*
  * helper function to update the accounting in the head ref
  * existing and update must have the same bytenr
@@ -618,7 +644,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref = btrfs_delayed_node_to_head(ref);
 	head_ref->must_insert_reserved = must_insert_reserved;
 	head_ref->is_data = is_data;
-	head_ref->ref_root = RB_ROOT;
+	INIT_LIST_HEAD(&head_ref->ref_list);
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
 
@@ -659,10 +685,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		     u64 num_bytes, u64 parent, u64 ref_root, int level,
 		     int action, int no_quota)
 {
-	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	u64 seq = 0;
+	int ret;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -693,21 +719,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 
 	trace_add_delayed_tree_ref(ref, full_ref, action);
 
-	spin_lock(&head_ref->lock);
-	existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
-	if (existing) {
-		update_existing_ref(trans, delayed_refs, head_ref, existing,
-				    ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
+	ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+	/*
+	 * XXX: memory should be freed at the same level allocated.
+	 * But bad practice is anywhere... Follow it now. Need cleanup.
+	 */
+	if (ret > 0)
 		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
-	} else {
-		atomic_inc(&delayed_refs->num_entries);
-		trans->delayed_ref_updates++;
-	}
-	spin_unlock(&head_ref->lock);
 }
 
 /*
@@ -721,10 +740,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
 		     u64 offset, int action, int no_quota)
 {
-	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	u64 seq = 0;
+	int ret;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -758,21 +777,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 
 	trace_add_delayed_data_ref(ref, full_ref, action);
 
-	spin_lock(&head_ref->lock);
-	existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
-	if (existing) {
-		update_existing_ref(trans, delayed_refs, head_ref, existing,
-				    ref);
-		/*
-		 * we've updated the existing ref, free the newly
-		 * allocated ref
-		 */
+	ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
+
+	if (ret > 0)
 		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
-	} else {
-		atomic_inc(&delayed_refs->num_entries);
-		trans->delayed_ref_updates++;
-	}
-	spin_unlock(&head_ref->lock);
 }
 
 /*
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5eb0892396d0..362ca57cfeb7 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -24,9 +24,25 @@
 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
 #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
 
+/*
+ * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the
+ * same ref_node structure.
+ * Ref_head is in a higher logic level than tree/data ref, and duplicated
+ * bytenr/num_bytes in ref_node is really a waste or memory, they should be
+ * referred from ref_head.
+ * This gets more disgusting after we use list to store tree/data ref in
+ * ref_head. Must clean this mess up later.
+ */
 struct btrfs_delayed_ref_node {
+	/*
+	 * ref_head use rb tree, stored in ref_root->href.
+	 * indexed by bytenr
+	 */
 	struct rb_node rb_node;
 
+	/*data/tree ref use list, stored in ref_head->ref_list. */
+	struct list_head list;
+
 	/* the starting bytenr of the extent */
 	u64 bytenr;
 
@@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head {
 	struct mutex mutex;
 
 	spinlock_t lock;
-	struct rb_root ref_root;
+	struct list_head ref_list;
 
 	struct rb_node href_node;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f8377871283..695363ae1c28 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4062,6 +4062,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 
 	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
 		struct btrfs_delayed_ref_head *head;
+		struct btrfs_delayed_ref_node *tmp;
 		bool pin_bytes = false;
 
 		head = rb_entry(node, struct btrfs_delayed_ref_head,
@@ -4077,11 +4078,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 			continue;
 		}
 		spin_lock(&head->lock);
-		while ((node = rb_first(&head->ref_root)) != NULL) {
-			ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				       rb_node);
+		list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
+						 list) {
 			ref->in_tree = 0;
-			rb_erase(&ref->rb_node, &head->ref_root);
+			list_del(&ref->list);
 			atomic_dec(&delayed_refs->num_entries);
 			btrfs_put_delayed_ref(ref);
 		}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4eefabcc838f..adf0eedde502 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2323,28 +2323,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-	struct rb_node *node;
-	struct btrfs_delayed_ref_node *ref, *last = NULL;;
+	if (list_empty(&head->ref_list))
+		return NULL;
 
-	/*
-	 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
-	 * this prevents ref count from going down to zero when
-	 * there still are pending delayed ref.
-	 */
-	node = rb_first(&head->ref_root);
-	while (node) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node,
-				rb_node);
-		if (ref->action == BTRFS_ADD_DELAYED_REF)
-			return ref;
-		else if (last == NULL)
-			last = ref;
-		node = rb_next(node);
-	}
-	return last;
+	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+			  list);
 }
 
 /*
@@ -2396,16 +2382,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			}
 		}
 
-		/*
-		 * We need to try and merge add/drops of the same ref since we
-		 * can run into issues with relocate dropping the implicit ref
-		 * and then it being added back again before the drop can
-		 * finish.  If we merged anything we need to re-loop so we can
-		 * get a good ref.
-		 */
 		spin_lock(&locked_ref->lock);
-		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
-					 locked_ref);
 
 		/*
 		 * locked_ref is the head node, so we have to go one
@@ -2482,7 +2459,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			spin_unlock(&locked_ref->lock);
 			spin_lock(&delayed_refs->lock);
 			spin_lock(&locked_ref->lock);
-			if (rb_first(&locked_ref->ref_root) ||
+			if (!list_empty(&locked_ref->ref_list) ||
 			    locked_ref->extent_op) {
 				spin_unlock(&locked_ref->lock);
 				spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2473,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		} else {
 			actual_count++;
 			ref->in_tree = 0;
-			rb_erase(&ref->rb_node, &locked_ref->ref_root);
+			list_del(&ref->list);
 		}
 		atomic_dec(&delayed_refs->num_entries);
 
@@ -2905,7 +2882,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_data_ref *data_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	struct rb_node *node;
 	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -2934,11 +2910,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 
 	spin_lock(&head->lock);
-	node = rb_first(&head->ref_root);
-	while (node) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		node = rb_next(node);
-
+	list_for_each_entry(ref, &head->ref_list, list) {
 		/* If it's a shared ref we know a cross reference exists */
 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
 			ret = 1;
@@ -6448,7 +6420,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 		goto out_delayed_unlock;
 
 	spin_lock(&head->lock);
-	if (rb_first(&head->ref_root))
+	if (!list_empty(&head->ref_list))
 		goto out;
 
 	if (head->extent_op) {
-- 
cgit v1.2.3-55-g7522


From c43d160fcd5e4e4f7d2983f5d5253ca33a1596ca Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 30 Mar 2015 17:12:29 +0800
Subject: btrfs: delayed-ref: Cleanup the unneeded functions.

Cleanup the rb_tree merge/insert/update functions, since now we use list
instead of rb_tree now.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/delayed-ref.c | 174 -------------------------------------------------
 1 file changed, 174 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 4dbc31636d14..fc9563d42693 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -84,87 +84,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
 	return 0;
 }
 
-/*
- * entries in the rb tree are ordered by the byte number of the extent,
- * type of the delayed backrefs and content of delayed backrefs.
- */
-static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1,
-		      bool compare_seq)
-{
-	if (ref1->bytenr < ref2->bytenr)
-		return -1;
-	if (ref1->bytenr > ref2->bytenr)
-		return 1;
-	if (ref1->is_head && ref2->is_head)
-		return 0;
-	if (ref2->is_head)
-		return -1;
-	if (ref1->is_head)
-		return 1;
-	if (ref1->type < ref2->type)
-		return -1;
-	if (ref1->type > ref2->type)
-		return 1;
-	if (ref1->no_quota > ref2->no_quota)
-		return 1;
-	if (ref1->no_quota < ref2->no_quota)
-		return -1;
-	/* merging of sequenced refs is not allowed */
-	if (compare_seq) {
-		if (ref1->seq < ref2->seq)
-			return -1;
-		if (ref1->seq > ref2->seq)
-			return 1;
-	}
-	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
-		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
-				      btrfs_delayed_node_to_tree_ref(ref1),
-				      ref1->type);
-	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
-		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
-		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
-				      btrfs_delayed_node_to_data_ref(ref1));
-	}
-	BUG();
-	return 0;
-}
-
-/*
- * insert a new ref into the rbtree.  This returns any existing refs
- * for the same (bytenr,parent) tuple, or NULL if the new node was properly
- * inserted.
- */
-static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
-						  struct rb_node *node)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent_node = NULL;
-	struct btrfs_delayed_ref_node *entry;
-	struct btrfs_delayed_ref_node *ins;
-	int cmp;
-
-	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-	while (*p) {
-		parent_node = *p;
-		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
-				 rb_node);
-
-		cmp = comp_entry(entry, ins, 1);
-		if (cmp < 0)
-			p = &(*p)->rb_left;
-		else if (cmp > 0)
-			p = &(*p)->rb_right;
-		else
-			return entry;
-	}
-
-	rb_link_node(node, parent_node, p);
-	rb_insert_color(node, root);
-	return NULL;
-}
-
 /* insert a new ref to head ref rbtree */
 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
 						   struct rb_node *node)
@@ -277,57 +196,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 		trans->delayed_ref_updates--;
 }
 
-static int merge_ref(struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_root *delayed_refs,
-		     struct btrfs_delayed_ref_head *head,
-		     struct btrfs_delayed_ref_node *ref, u64 seq)
-{
-	struct rb_node *node;
-	int mod = 0;
-	int done = 0;
-
-	node = rb_next(&ref->rb_node);
-	while (!done && node) {
-		struct btrfs_delayed_ref_node *next;
-
-		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		node = rb_next(node);
-		if (seq && next->seq >= seq)
-			break;
-		if (comp_entry(ref, next, 0))
-			continue;
-
-		if (ref->action == next->action) {
-			mod = next->ref_mod;
-		} else {
-			if (ref->ref_mod < next->ref_mod) {
-				struct btrfs_delayed_ref_node *tmp;
-
-				tmp = ref;
-				ref = next;
-				next = tmp;
-				done = 1;
-			}
-			mod = -next->ref_mod;
-		}
-
-		drop_delayed_ref(trans, delayed_refs, head, next);
-		ref->ref_mod += mod;
-		if (ref->ref_mod == 0) {
-			drop_delayed_ref(trans, delayed_refs, head, ref);
-			done = 1;
-		} else {
-			/*
-			 * You can't have multiples of the same ref on a tree
-			 * block.
-			 */
-			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
-				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		}
-	}
-	return done;
-}
-
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
@@ -400,48 +268,6 @@ again:
 	return head;
 }
 
-/*
- * helper function to update an extent delayed ref in the
- * rbtree.  existing and update must both have the same
- * bytenr and parent
- *
- * This may free existing if the update cancels out whatever
- * operation it was doing.
- */
-static noinline void
-update_existing_ref(struct btrfs_trans_handle *trans,
-		    struct btrfs_delayed_ref_root *delayed_refs,
-		    struct btrfs_delayed_ref_head *head,
-		    struct btrfs_delayed_ref_node *existing,
-		    struct btrfs_delayed_ref_node *update)
-{
-	if (update->action != existing->action) {
-		/*
-		 * this is effectively undoing either an add or a
-		 * drop.  We decrement the ref_mod, and if it goes
-		 * down to zero we just delete the entry without
-		 * every changing the extent allocation tree.
-		 */
-		existing->ref_mod--;
-		if (existing->ref_mod == 0)
-			drop_delayed_ref(trans, delayed_refs, head, existing);
-		else
-			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-	} else {
-		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
-			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		/*
-		 * the action on the existing ref matches
-		 * the action on the ref we're trying to add.
-		 * Bump the ref_mod by one so the backref that
-		 * is eventually added/removed has the correct
-		 * reference count
-		 */
-		existing->ref_mod += update->ref_mod;
-	}
-}
-
 /*
  * Helper to insert the ref_node to the tail or merge with tail.
  *
-- 
cgit v1.2.3-55-g7522


From 9c542136fd94941572762e7955e6a054b23e97f7 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 12 Mar 2015 16:10:13 +0800
Subject: btrfs: qgroup: Cleanup open-coded old/new_refcnt update and read.

Use inline functions to do such things, to improve readability.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Acked-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 95 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 842ff86d4ae8..26f9da26e17e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -84,11 +84,42 @@ struct btrfs_qgroup {
 
 	/*
 	 * temp variables for accounting operations
+	 * Refer to qgroup_shared_accouting() for details.
 	 */
 	u64 old_refcnt;
 	u64 new_refcnt;
 };
 
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+					   int mod)
+{
+	if (qg->old_refcnt < seq)
+		qg->old_refcnt = seq;
+	qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+					   int mod)
+{
+	if (qg->new_refcnt < seq)
+		qg->new_refcnt = seq;
+	qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+	if (qg->old_refcnt < seq)
+		return 0;
+	return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+	if (qg->new_refcnt < seq)
+		return 0;
+	return qg->new_refcnt - seq;
+}
+
 /*
  * glue structure to represent the relations between qgroups.
  */
@@ -1601,6 +1632,7 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
 		ULIST_ITER_INIT(&tmp_uiter);
 		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
 			struct btrfs_qgroup_list *glist;
+			int mod;
 
 			qg = u64_to_ptr(tmp_unode->aux);
 			/*
@@ -1612,20 +1644,15 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
 			 * upper level qgroups in order to determine exclusive
 			 * counts.
 			 *
-			 * For rescan we want to set old_refcnt to seq so our
-			 * exclusive calculations end up correct.
+			 * For rescan none of the extent is recorded before so
+			 * we just don't add old_refcnt.
 			 */
 			if (rescan)
-				qg->old_refcnt = seq;
-			else if (qg->old_refcnt < seq)
-				qg->old_refcnt = seq + 1;
-			else
-				qg->old_refcnt++;
-
-			if (qg->new_refcnt < seq)
-				qg->new_refcnt = seq + 1;
+				mod = 0;
 			else
-				qg->new_refcnt++;
+				mod = 1;
+			btrfs_qgroup_update_old_refcnt(qg, seq, mod);
+			btrfs_qgroup_update_new_refcnt(qg, seq, 1);
 			list_for_each_entry(glist, &qg->groups, next_group) {
 				ret = ulist_add(qgroups, glist->group->qgroupid,
 						ptr_to_u64(glist->group),
@@ -1719,14 +1746,8 @@ next:
 		struct btrfs_qgroup_list *glist;
 
 		qg = u64_to_ptr(unode->aux);
-		if (qg->old_refcnt < seq)
-			qg->old_refcnt = seq + 1;
-		else
-			qg->old_refcnt++;
-		if (qg->new_refcnt < seq)
-			qg->new_refcnt = seq + 1;
-		else
-			qg->new_refcnt++;
+		btrfs_qgroup_update_old_refcnt(qg, seq, 1);
+		btrfs_qgroup_update_new_refcnt(qg, seq, 1);
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ret = ulist_add(qgroups, glist->group->qgroupid,
 					ptr_to_u64(glist->group), GFP_ATOMIC);
@@ -1767,17 +1788,10 @@ static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
 		struct btrfs_qgroup_list *glist;
 
 		qg = u64_to_ptr(unode->aux);
-		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
-			if (qg->new_refcnt < seq)
-				qg->new_refcnt = seq + 1;
-			else
-				qg->new_refcnt++;
-		} else {
-			if (qg->old_refcnt < seq)
-				qg->old_refcnt = seq + 1;
-			else
-				qg->old_refcnt++;
-		}
+		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED)
+			btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+		else
+			btrfs_qgroup_update_old_refcnt(qg, seq, 1);
 		list_for_each_entry(glist, &qg->groups, next_group) {
 			ret = ulist_add(tmp, glist->group->qgroupid,
 					ptr_to_u64(glist->group), GFP_ATOMIC);
@@ -1810,11 +1824,14 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
 		bool dirty = false;
 
 		qg = u64_to_ptr(unode->aux);
+		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
 		/*
 		 * Wasn't referenced before but is now, add to the reference
 		 * counters.
 		 */
-		if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+		if (cur_old_count == 0 && cur_new_count > 0) {
 			qg->rfer += num_bytes;
 			qg->rfer_cmpr += num_bytes;
 			dirty = true;
@@ -1824,21 +1841,12 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
 		 * Was referenced before but isn't now, subtract from the
 		 * reference counters.
 		 */
-		if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+		if (cur_old_count > 0 && cur_new_count == 0) {
 			qg->rfer -= num_bytes;
 			qg->rfer_cmpr -= num_bytes;
 			dirty = true;
 		}
 
-		if (qg->old_refcnt < seq)
-			cur_old_count = 0;
-		else
-			cur_old_count = qg->old_refcnt - seq;
-		if (qg->new_refcnt < seq)
-			cur_new_count = 0;
-		else
-			cur_new_count = qg->new_refcnt - seq;
-
 		/*
 		 * If our refcount was the same as the roots previously but our
 		 * new count isn't the same as the number of roots now then we
@@ -2036,6 +2044,11 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
 		new_roots = old_roots;
 		old_roots++;
 	}
+
+	/*
+	 * Bump qgroup_seq to avoid seq overlap
+	 * XXX: This makes qgroup_seq mismatch with oper->seq.
+	 */
 	fs_info->qgroup_seq += old_roots + 1;
 
 
-- 
cgit v1.2.3-55-g7522


From c682f9b3c2e091f3211ca68585be39f2a2beb8d0 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 17 Mar 2015 16:59:47 +0800
Subject: btrfs: extent-tree: Use ref_node to replace unneeded parameters in
 __inc_extent_ref() and __free_extent()

__btrfs_inc_extent_ref() and __btrfs_free_extent() have already had too
many parameters, but three of them can be extracted from
btrfs_delayed_ref_node struct.

So use btrfs_delayed_ref_node struct as a single parameter to replace
the bytenr/num_byte/no_quota parameters.

The real objective of this patch is to allow btrfs_qgroup_record_ref()
get the delayed_ref_node in incoming qgroup patches.

Other functions calling btrfs_qgroup_record_ref() are not affected since
the rest will only add/sub exclusive extents, where node is not used.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index adf0eedde502..236a12f7d5f7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 parent,
+				struct btrfs_delayed_ref_node *node, u64 parent,
 				u64 root_objectid, u64 owner_objectid,
 				u64 owner_offset, int refs_to_drop,
-				struct btrfs_delayed_extent_op *extra_op,
-				int no_quota);
+				struct btrfs_delayed_extent_op *extra_op);
 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 				    struct extent_buffer *leaf,
 				    struct btrfs_extent_item *ei);
@@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
-				  u64 bytenr, u64 num_bytes,
+				  struct btrfs_delayed_ref_node *node,
 				  u64 parent, u64 root_objectid,
 				  u64 owner, u64 offset, int refs_to_add,
-				  int no_quota,
 				  struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,8 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_extent_item *item;
 	struct btrfs_key key;
+	u64 bytenr = node->bytenr;
+	u64 num_bytes = node->num_bytes;
 	u64 refs;
 	int ret;
+	int no_quota = node->no_quota;
 	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
 
 	path = btrfs_alloc_path();
@@ -2087,17 +2088,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 						 ref->objectid, ref->offset,
 						 &ins, node->ref_mod);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-					     node->num_bytes, parent,
+		ret = __btrfs_inc_extent_ref(trans, root, node, parent,
 					     ref_root, ref->objectid,
 					     ref->offset, node->ref_mod,
-					     node->no_quota, extent_op);
+					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, root, node->bytenr,
-					  node->num_bytes, parent,
+		ret = __btrfs_free_extent(trans, root, node, parent,
 					  ref_root, ref->objectid,
 					  ref->offset, node->ref_mod,
-					  extent_op, node->no_quota);
+					  extent_op);
 	} else {
 		BUG();
 	}
@@ -2255,15 +2254,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 						ref->level, &ins,
 						node->no_quota);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-					     node->num_bytes, parent, ref_root,
-					     ref->level, 0, 1, node->no_quota,
+		ret = __btrfs_inc_extent_ref(trans, root, node,
+					     parent, ref_root,
+					     ref->level, 0, 1,
 					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, root, node->bytenr,
-					  node->num_bytes, parent, ref_root,
-					  ref->level, 0, 1, extent_op,
-					  node->no_quota);
+		ret = __btrfs_free_extent(trans, root, node,
+					  parent, ref_root,
+					  ref->level, 0, 1, extent_op);
 	} else {
 		BUG();
 	}
@@ -6119,11 +6117,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
-				u64 bytenr, u64 num_bytes, u64 parent,
+				struct btrfs_delayed_ref_node *node, u64 parent,
 				u64 root_objectid, u64 owner_objectid,
 				u64 owner_offset, int refs_to_drop,
-				struct btrfs_delayed_extent_op *extent_op,
-				int no_quota)
+				struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -6137,8 +6134,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	int extent_slot = 0;
 	int found_extent = 0;
 	int num_to_del = 1;
+	int no_quota = node->no_quota;
 	u32 item_size;
 	u64 refs;
+	u64 bytenr = node->bytenr;
+	u64 num_bytes = node->num_bytes;
 	int last_ref = 0;
 	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
-- 
cgit v1.2.3-55-g7522


From d810ef2be5b8b15c9b6f88f9a09b1b4b16e30871 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Sun, 12 Apr 2015 16:52:34 +0800
Subject: btrfs: qgroup: Add function qgroup_update_refcnt().

This function is used to update refcnt for qgroups.
And is one of the two core functions used in the new qgroup implement.

This is based on the old update_old/new_refcnt, but provides a unified
logic and behavior.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 26f9da26e17e..81773176d01f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1806,6 +1806,64 @@ static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+#define UPDATE_NEW	0
+#define UPDATE_OLD	1
+/*
+ * Walk all of the roots that points to the bytenr and adjust their refcnts.
+ */
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+				struct ulist *roots, struct ulist *tmp,
+				struct ulist *qgroups, u64 seq, int update_old)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct ulist_node *tmp_unode;
+	struct ulist_iterator tmp_uiter;
+	struct btrfs_qgroup *qg;
+	int ret = 0;
+
+	if (!roots)
+		return 0;
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(roots, &uiter))) {
+		qg = find_qgroup_rb(fs_info, unode->val);
+		if (!qg)
+			continue;
+
+		ulist_reinit(tmp);
+		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+				GFP_ATOMIC);
+		if (ret < 0)
+			return ret;
+		ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
+		if (ret < 0)
+			return ret;
+		ULIST_ITER_INIT(&tmp_uiter);
+		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
+			struct btrfs_qgroup_list *glist;
+
+			qg = u64_to_ptr(tmp_unode->aux);
+			if (update_old)
+				btrfs_qgroup_update_old_refcnt(qg, seq, 1);
+			else
+				btrfs_qgroup_update_new_refcnt(qg, seq, 1);
+			list_for_each_entry(glist, &qg->groups, next_group) {
+				ret = ulist_add(qgroups, glist->group->qgroupid,
+						ptr_to_u64(glist->group),
+						GFP_ATOMIC);
+				if (ret < 0)
+					return ret;
+				ret = ulist_add(tmp, glist->group->qgroupid,
+						ptr_to_u64(glist->group),
+						GFP_ATOMIC);
+				if (ret < 0)
+					return ret;
+			}
+		}
+	}
+	return 0;
+}
+
 /*
  * This adjusts the counters for all referenced qgroups if need be.
  */
-- 
cgit v1.2.3-55-g7522


From 823ae5b8e340003dacbe7cd08a355efe018c9f1b Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Sun, 12 Apr 2015 16:59:57 +0800
Subject: btrfs: qgroup: Add function qgroup_update_counters().

Add function qgroup_update_counters(), which will update related
qgroups' rfer/excl according to old/new_roots.

This is one of the two core functions for the new qgroup implement.

This is based on btrfs_adjust_coutners() but with clearer logic and
comment.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 81773176d01f..2f185eee2387 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1864,6 +1864,126 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
+/*
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy, codes can explain themselves.
+ * Excl update is tricky, the update is split into 2 part.
+ * Part 1: Possible exclusive <-> sharing detect:
+ *	|	A	|	!A	|
+ *  -------------------------------------
+ *  B	|	*	|	-	|
+ *  -------------------------------------
+ *  !B	|	+	|	**	|
+ *  -------------------------------------
+ *
+ * Conditions:
+ * A:	cur_old_roots < nr_old_roots	(not exclusive before)
+ * !A:	cur_old_roots == nr_old_roots	(possible exclusive before)
+ * B:	cur_new_roots < nr_new_roots	(not exclusive now)
+ * !B:	cur_new_roots == nr_new_roots	(possible exclsuive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
+ * *: Definitely not changed.		**: Possible unchanged.
+ *
+ * For !A and !B condition, the exception is cur_old/new_roots == 0 case.
+ *
+ * To make the logic clear, we first use condition A and B to split
+ * combination into 4 results.
+ *
+ * Then, for result "+" and "-", check old/new_roots == 0 case, as in them
+ * only on variant maybe 0.
+ *
+ * Lastly, check result **, since there are 2 variants maybe 0, split them
+ * again(2x2).
+ * But this time we don't need to consider other things, the codes and logic
+ * is easy to understand now.
+ */
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+				  struct ulist *qgroups,
+				  u64 nr_old_roots,
+				  u64 nr_new_roots,
+				  u64 num_bytes, u64 seq)
+{
+	struct ulist_node *unode;
+	struct ulist_iterator uiter;
+	struct btrfs_qgroup *qg;
+	u64 cur_new_count, cur_old_count;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((unode = ulist_next(qgroups, &uiter))) {
+		bool dirty = false;
+
+		qg = u64_to_ptr(unode->aux);
+		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+		/* Rfer update part */
+		if (cur_old_count == 0 && cur_new_count > 0) {
+			qg->rfer += num_bytes;
+			qg->rfer_cmpr += num_bytes;
+			dirty = true;
+		}
+		if (cur_old_count > 0 && cur_new_count == 0) {
+			qg->rfer -= num_bytes;
+			qg->rfer_cmpr -= num_bytes;
+			dirty = true;
+		}
+
+		/* Excl update part */
+		/* Exclusive/none -> shared case */
+		if (cur_old_count == nr_old_roots &&
+		    cur_new_count < nr_new_roots) {
+			/* Exclusive -> shared */
+			if (cur_old_count != 0) {
+				qg->excl -= num_bytes;
+				qg->excl_cmpr -= num_bytes;
+				dirty = true;
+			}
+		}
+
+		/* Shared -> exclusive/none case */
+		if (cur_old_count < nr_old_roots &&
+		    cur_new_count == nr_new_roots) {
+			/* Shared->exclusive */
+			if (cur_new_count != 0) {
+				qg->excl += num_bytes;
+				qg->excl_cmpr += num_bytes;
+				dirty = true;
+			}
+		}
+
+		/* Exclusive/none -> exclusive/none case */
+		if (cur_old_count == nr_old_roots &&
+		    cur_new_count == nr_new_roots) {
+			if (cur_old_count == 0) {
+				/* None -> exclusive/none */
+
+				if (cur_new_count != 0) {
+					/* None -> exclusive */
+					qg->excl += num_bytes;
+					qg->excl_cmpr += num_bytes;
+					dirty = true;
+				}
+				/* None -> none, nothing changed */
+			} else {
+				/* Exclusive -> exclusive/none */
+
+				if (cur_new_count == 0) {
+					/* Exclusive -> none */
+					qg->excl -= num_bytes;
+					qg->excl_cmpr -= num_bytes;
+					dirty = true;
+				}
+				/* Exclusive -> exclusive, nothing changed */
+			}
+		}
+		if (dirty)
+			qgroup_dirty(fs_info, qg);
+	}
+	return 0;
+}
+
 /*
  * This adjusts the counters for all referenced qgroups if need be.
  */
-- 
cgit v1.2.3-55-g7522


From 3368d001ba5df44930d986e82b1b497d4da285ba Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Apr 2015 14:34:17 +0800
Subject: btrfs: qgroup: Record possible quota-related extent for qgroup.

Add hook in add_delayed_ref_head() to record quota-related extent record
into delayed_ref_root->dirty_extent_record rb-tree for later qgroup
accounting.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/delayed-ref.c | 50 +++++++++++++++++++++++++++++++++++++++++++-------
 fs/btrfs/delayed-ref.h |  3 +++
 fs/btrfs/qgroup.c      | 31 +++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h      | 17 +++++++++++++++++
 fs/btrfs/transaction.c |  1 +
 5 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index fc9563d42693..fd64fd0f011a 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -22,6 +22,7 @@
 #include "ctree.h"
 #include "delayed-ref.h"
 #include "transaction.h"
+#include "qgroup.h"
 
 struct kmem_cache *btrfs_delayed_ref_head_cachep;
 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -420,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 static noinline struct btrfs_delayed_ref_head *
 add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		     struct btrfs_trans_handle *trans,
-		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
-		     u64 num_bytes, int action, int is_data)
+		     struct btrfs_delayed_ref_node *ref,
+		     struct btrfs_qgroup_extent_record *qrecord,
+		     u64 bytenr, u64 num_bytes, int action, int is_data)
 {
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_qgroup_extent_record *qexisting;
 	int count_mod = 1;
 	int must_insert_reserved = 0;
 
@@ -474,6 +477,18 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
 
+	/* Record qgroup extent info if provided */
+	if (qrecord) {
+		qrecord->bytenr = bytenr;
+		qrecord->num_bytes = num_bytes;
+		qrecord->old_roots = NULL;
+
+		qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+							     qrecord);
+		if (qexisting)
+			kfree(qrecord);
+	}
+
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
 
@@ -624,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_qgroup_extent_record *record = NULL;
 
 	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
 		no_quota = 0;
@@ -639,6 +655,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		return -ENOMEM;
 	}
 
+	if (fs_info->quota_enabled && is_fstree(ref_root)) {
+		record = kmalloc(sizeof(*record), GFP_NOFS);
+		if (!record) {
+			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+			return -ENOMEM;
+		}
+	}
+
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -648,7 +673,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
 					bytenr, num_bytes, action, 0);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -673,6 +698,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_qgroup_extent_record *record = NULL;
 
 	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
 		no_quota = 0;
@@ -688,6 +714,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		return -ENOMEM;
 	}
 
+	if (fs_info->quota_enabled && is_fstree(ref_root)) {
+		record = kmalloc(sizeof(*record), GFP_NOFS);
+		if (!record) {
+			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+			kmem_cache_free(btrfs_delayed_ref_head_cachep,
+					head_ref);
+			return -ENOMEM;
+		}
+	}
+
 	head_ref->extent_op = extent_op;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -697,7 +733,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
 					bytenr, num_bytes, action, 1);
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
@@ -725,9 +761,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
-				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
-				   extent_op->is_data);
+	add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
+			     num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+			     extent_op->is_data);
 
 	spin_unlock(&delayed_refs->lock);
 	return 0;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 362ca57cfeb7..4016f963599e 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -148,6 +148,9 @@ struct btrfs_delayed_ref_root {
 	/* head ref rbtree */
 	struct rb_root href_root;
 
+	/* dirty extent records */
+	struct rb_root dirty_extent_root;
+
 	/* this spin lock protects the rbtree and the entries inside */
 	spinlock_t lock;
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 2f185eee2387..55465d5d788e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1553,6 +1553,37 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+				  struct btrfs_qgroup_extent_record *record)
+{
+	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+	struct rb_node *parent_node = NULL;
+	struct btrfs_qgroup_extent_record *entry;
+	u64 bytenr = record->bytenr;
+
+	while (*p) {
+		parent_node = *p;
+		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+				 node);
+		if (bytenr < entry->bytenr)
+			p = &(*p)->rb_left;
+		else if (bytenr > entry->bytenr)
+			p = &(*p)->rb_right;
+		else
+			return entry;
+	}
+
+	rb_link_node(&record->node, parent_node, p);
+	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+	return NULL;
+}
+
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ */
 static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 				  struct btrfs_qgroup_operation *oper)
 {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index c5242aa9a4b2..e58155d0390c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -19,6 +19,9 @@
 #ifndef __BTRFS_QGROUP__
 #define __BTRFS_QGROUP__
 
+#include "ulist.h"
+#include "delayed-ref.h"
+
 /*
  * A description of the operations, all of these operations only happen when we
  * are adding the 1st reference for that subvolume in the case of adding space
@@ -58,6 +61,17 @@ struct btrfs_qgroup_operation {
 	struct list_head list;
 };
 
+/*
+ * Record a dirty extent, and info qgroup to update quota on it
+ * TODO: Use kmem cache to alloc it.
+ */
+struct btrfs_qgroup_extent_record {
+	struct rb_node node;
+	u64 bytenr;
+	u64 num_bytes;
+	struct ulist *old_roots;
+};
+
 int btrfs_quota_enable(struct btrfs_trans_handle *trans,
 		       struct btrfs_fs_info *fs_info);
 int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -84,6 +98,9 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 			    u64 bytenr, u64 num_bytes,
 			    enum btrfs_qgroup_operation_type type,
 			    int mod_seq);
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+				  struct btrfs_qgroup_extent_record *record);
 int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
 				    struct btrfs_fs_info *fs_info);
 void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 03a3ec7e31ea..3694d57e759f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -225,6 +225,7 @@ loop:
 	cur_trans->dirty_bg_run = 0;
 
 	cur_trans->delayed_refs.href_root = RB_ROOT;
+	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 	cur_trans->delayed_refs.num_heads_ready = 0;
 	cur_trans->delayed_refs.pending_csums = 0;
-- 
cgit v1.2.3-55-g7522


From 3b7d00f99c60b31e1cff0efc6b9178eea3696e27 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Apr 2015 16:40:39 +0800
Subject: btrfs: qgroup: Add new function to record old_roots.

Add function btrfs_qgroup_prepare_account_extents() to get old_roots
which are needed for qgroup.

We do it in commit_transaction() and before switch_roots(), and only
search commit_root, so it gives a quite accurate view for previous
transaction.

With old_roots from previous transaction, we can use it to do accurate
account with current transaction.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 27 +++++++++++++++++++++++++++
 fs/btrfs/qgroup.h |  2 ++
 2 files changed, 29 insertions(+)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 55465d5d788e..7b18fff558ca 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1553,6 +1553,33 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_qgroup_extent_record *record;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *node;
+	int ret = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/*
+	 * No need to do lock, since this function will only be called in
+	 * btrfs_commmit_transaction().
+	 */
+	node = rb_first(&delayed_refs->dirty_extent_root);
+	while (node) {
+		record = rb_entry(node, struct btrfs_qgroup_extent_record,
+				  node);
+		ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+					   &record->old_roots);
+		if (ret < 0)
+			break;
+		node = rb_next(node);
+	}
+	return ret;
+}
+
 struct btrfs_qgroup_extent_record
 *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
 				  struct btrfs_qgroup_extent_record *record)
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index e58155d0390c..6fe249f078ac 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -98,6 +98,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
 			    u64 bytenr, u64 num_bytes,
 			    enum btrfs_qgroup_operation_type type,
 			    int mod_seq);
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info);
 struct btrfs_qgroup_extent_record
 *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
 				  struct btrfs_qgroup_extent_record *record);
-- 
cgit v1.2.3-55-g7522


From 21633fc6037f8ceb2bb927dacc3f9ef46ccae891 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Apr 2015 14:54:50 +0800
Subject: btrfs: backref: Add special time_seq == (u64)-1 case for
 btrfs_find_all_roots().

Allow btrfs_find_all_roots() to skip all delayed_ref_head lock and tree
lock to do tree search.

This is important for later qgroup implement which will call
find_all_roots() after fs trees are committed.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/backref.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 49bc5a41c1f8..802fabb30e15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 	 * the first item to check. But sometimes, we may enter it with
 	 * slot==nritems. In that case, go to the next leaf before we continue.
 	 */
-	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
-		ret = btrfs_next_old_leaf(root, path, time_seq);
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+		if (time_seq == (u64)-1)
+			ret = btrfs_next_leaf(root, path);
+		else
+			ret = btrfs_next_old_leaf(root, path, time_seq);
+	}
 
 	while (!ret && count < total_refs) {
 		eb = path->nodes[0];
@@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			eie = NULL;
 		}
 next:
-		ret = btrfs_next_old_item(root, path, time_seq);
+		if (time_seq == (u64)-1)
+			ret = btrfs_next_item(root, path);
+		else
+			ret = btrfs_next_old_item(root, path, time_seq);
 	}
 
 	if (ret > 0)
@@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 
 	if (path->search_commit_root)
 		root_level = btrfs_header_level(root->commit_root);
+	else if (time_seq == (u64)-1)
+		root_level = btrfs_header_level(root->node);
 	else
 		root_level = btrfs_old_root_level(root, time_seq);
 
@@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	}
 
 	path->lowest_level = level;
-	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+	if (time_seq == (u64)-1)
+		ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path,
+					0, 0);
+	else
+		ret = btrfs_search_old_slot(root, &ref->key_for_search, path,
+					    time_seq);
 
 	/* root node has been locked, we can release @subvol_srcu safely here */
 	srcu_read_unlock(&fs_info->subvol_srcu, index);
@@ -879,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  *
  * NOTE: This can return values > 0
  *
+ * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave
+ * much like trans == NULL case, the difference only lies in it will not
+ * commit root.
+ * The special case is for qgroup to search roots in commit_transaction().
+ *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
@@ -917,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
 		path->skip_locking = 1;
 	}
 
+	if (time_seq == (u64)-1)
+		path->skip_locking = 1;
+
 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
 	 * We need both to get a consistent picture of how the refs look
@@ -931,9 +953,10 @@ again:
 	BUG_ON(ret == 0);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (trans && likely(trans->type != __TRANS_DUMMY)) {
+	if (trans && likely(trans->type != __TRANS_DUMMY) &&
+	    time_seq != (u64)-1) {
 #else
-	if (trans) {
+	if (trans && time_seq != (u64)-1) {
 #endif
 		/*
 		 * look if there are updates for this ref queued and lock the
-- 
cgit v1.2.3-55-g7522


From 550d7a2ed5db35756222ec17cff3376ff38d78e2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Apr 2015 15:37:33 +0800
Subject: btrfs: qgroup: Add new qgroup calculation function
 btrfs_qgroup_account_extents().

The new btrfs_qgroup_account_extents() function should be called in
btrfs_commit_transaction() and it will update all the qgroup according
to delayed_ref_root->dirty_extent_root.

The new function can handle both normal operation during
commit_transaction() or in rescan in a unified method with clearer
logic.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.h |   0
 fs/btrfs/qgroup.c      | 116 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/qgroup.h      |   2 +
 3 files changed, 118 insertions(+)
 create mode 100644 fs/btrfs/extent-tree.h

diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 7b18fff558ca..607c6ca34955 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2455,6 +2455,122 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
+			    u64 bytenr, u64 num_bytes,
+			    struct ulist *old_roots, struct ulist *new_roots)
+{
+	struct ulist *qgroups = NULL;
+	struct ulist *tmp = NULL;
+	u64 seq;
+	u64 nr_new_roots = 0;
+	u64 nr_old_roots = 0;
+	int ret = 0;
+
+	if (new_roots)
+		nr_new_roots = new_roots->nnodes;
+	if (old_roots)
+		nr_old_roots = old_roots->nnodes;
+
+	if (!fs_info->quota_enabled)
+		goto out_free;
+	BUG_ON(!fs_info->quota_root);
+
+	qgroups = ulist_alloc(GFP_NOFS);
+	if (!qgroups) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+
+	mutex_lock(&fs_info->qgroup_rescan_lock);
+	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+			mutex_unlock(&fs_info->qgroup_rescan_lock);
+			ret = 0;
+			goto out_free;
+		}
+	}
+	mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+	spin_lock(&fs_info->qgroup_lock);
+	seq = fs_info->qgroup_seq;
+
+	/* Update old refcnts using old_roots */
+	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+				   UPDATE_OLD);
+	if (ret < 0)
+		goto out;
+
+	/* Update new refcnts using new_roots */
+	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+				   UPDATE_NEW);
+	if (ret < 0)
+		goto out;
+
+	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+			       num_bytes, seq);
+
+	/*
+	 * Bump qgroup_seq to avoid seq overlap
+	 */
+	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
+out:
+	spin_unlock(&fs_info->qgroup_lock);
+out_free:
+	ulist_free(tmp);
+	ulist_free(qgroups);
+	ulist_free(old_roots);
+	ulist_free(new_roots);
+	return ret;
+}
+
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_qgroup_extent_record *record;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct ulist *new_roots = NULL;
+	struct rb_node *node;
+	int ret = 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+		record = rb_entry(node, struct btrfs_qgroup_extent_record,
+				  node);
+
+		if (!ret) {
+			/*
+			 * Use (u64)-1 as time_seq to do special search, which
+			 * doesn't lock tree or delayed_refs and search current
+			 * root. It's safe inside commit_transaction().
+			 */
+			ret = btrfs_find_all_roots(trans, fs_info,
+					record->bytenr, (u64)-1, &new_roots);
+			if (ret < 0)
+				goto cleanup;
+			ret = btrfs_qgroup_account_extent(trans, fs_info,
+					record->bytenr, record->num_bytes,
+					record->old_roots, new_roots);
+			record->old_roots = NULL;
+			new_roots = NULL;
+		}
+cleanup:
+		ulist_free(record->old_roots);
+		ulist_free(new_roots);
+		new_roots = NULL;
+		rb_erase(node, &delayed_refs->dirty_extent_root);
+		kfree(record);
+
+	}
+	return ret;
+}
+
 /*
  * Needs to be called everytime we run delayed refs, even if there is an error
  * in order to cleanup outstanding operations.
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6fe249f078ac..cb703f859af6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -103,6 +103,8 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 struct btrfs_qgroup_extent_record
 *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
 				  struct btrfs_qgroup_extent_record *record);
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info);
 int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
 				    struct btrfs_fs_info *fs_info);
 void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3-55-g7522


From 9d220c95f5191f6e99fbb992d5f36a929d3fc432 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 13 Apr 2015 11:02:16 +0800
Subject: btrfs: qgroup: Switch rescan to new mechanism.

Switch rescan to use the new new extent oriented mechanism.

As rescan is also based on extent, new mechanism is just a perfect match
for rescan.

With re-designed internal functions, rescan is quite easy, just call
btrfs_find_all_roots() and then btrfs_qgroup_account_one_extent().

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c | 43 +++++++------------------------------------
 1 file changed, 7 insertions(+), 36 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 607c6ca34955..70c366f27e25 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3002,15 +3002,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-		   struct btrfs_trans_handle *trans, struct ulist *qgroups,
-		   struct ulist *tmp, struct extent_buffer *scratch_leaf)
+		   struct btrfs_trans_handle *trans,
+		   struct extent_buffer *scratch_leaf)
 {
 	struct btrfs_key found;
 	struct ulist *roots = NULL;
 	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	u64 num_bytes;
-	u64 seq;
-	int new_roots;
 	int slot;
 	int ret;
 
@@ -3060,33 +3058,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 		else
 			num_bytes = found.offset;
 
-		ulist_reinit(qgroups);
 		ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
 					   &roots);
 		if (ret < 0)
 			goto out;
-		spin_lock(&fs_info->qgroup_lock);
-		seq = fs_info->qgroup_seq;
-		fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
-		new_roots = 0;
-		ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
-					     seq, &new_roots, 1);
-		if (ret < 0) {
-			spin_unlock(&fs_info->qgroup_lock);
-			ulist_free(roots);
-			goto out;
-		}
-
-		ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
-					     seq, 0, new_roots, 1);
-		if (ret < 0) {
-			spin_unlock(&fs_info->qgroup_lock);
-			ulist_free(roots);
+		/* For rescan, just pass old_roots as NULL */
+		ret = btrfs_qgroup_account_extent(trans, fs_info,
+				found.objectid, num_bytes, NULL, roots);
+		if (ret < 0)
 			goto out;
-		}
-		spin_unlock(&fs_info->qgroup_lock);
-		ulist_free(roots);
 	}
 out:
 	btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -3100,7 +3080,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 						     qgroup_rescan_work);
 	struct btrfs_path *path;
 	struct btrfs_trans_handle *trans = NULL;
-	struct ulist *tmp = NULL, *qgroups = NULL;
 	struct extent_buffer *scratch_leaf = NULL;
 	int err = -ENOMEM;
 	int ret = 0;
@@ -3108,12 +3087,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
-	qgroups = ulist_alloc(GFP_NOFS);
-	if (!qgroups)
-		goto out;
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		goto out;
 	scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
 	if (!scratch_leaf)
 		goto out;
@@ -3129,7 +3102,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 			err = -EINTR;
 		} else {
 			err = qgroup_rescan_leaf(fs_info, path, trans,
-						 qgroups, tmp, scratch_leaf);
+						 scratch_leaf);
 		}
 		if (err > 0)
 			btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -3139,8 +3112,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 
 out:
 	kfree(scratch_leaf);
-	ulist_free(qgroups);
-	ulist_free(tmp);
 	btrfs_free_path(path);
 
 	mutex_lock(&fs_info->qgroup_rescan_lock);
-- 
cgit v1.2.3-55-g7522


From 0ed4792af0e8346cb670b4bc540df7594f4b2020 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Apr 2015 16:55:08 +0800
Subject: btrfs: qgroup: Switch to new extent-oriented qgroup mechanism.

Switch from old ref_node based qgroup to extent based qgroup mechanism
for normal operations.

The new mechanism should hugely reduce the overhead of btrfs quota
system, and further more, the codes and logic should be more clean and
easier to maintain.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 110 +++++--------------------------------------------
 fs/btrfs/transaction.c |  18 ++++++++
 2 files changed, 28 insertions(+), 100 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 236a12f7d5f7..b76b42d95619 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1997,26 +1997,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 					   bytenr, num_bytes, parent,
 					   root_objectid, owner, offset,
 					   refs_to_add, extent_op);
-	if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+	if ((ret < 0 && ret != -EAGAIN) || !ret)
 		goto out;
-	/*
-	 * Ok we were able to insert an inline extent and it appears to be a new
-	 * reference, deal with the qgroup accounting.
-	 */
-	if (!ret && !no_quota) {
-		ASSERT(root->fs_info->quota_enabled);
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		item = btrfs_item_ptr(leaf, path->slots[0],
-				      struct btrfs_extent_item);
-		if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
-			type = BTRFS_QGROUP_OPER_ADD_SHARED;
-		btrfs_release_path(path);
-
-		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-					      bytenr, num_bytes, type, 0);
-		goto out;
-	}
 
 	/*
 	 * Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2036,13 +2018,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
-	if (!no_quota) {
-		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-					      bytenr, num_bytes, type, 0);
-		if (ret)
-			goto out;
-	}
-
 	path->reada = 1;
 	path->leave_spinning = 1;
 	/* now insert the actual backref */
@@ -2839,9 +2814,6 @@ again:
 		goto again;
 	}
 out:
-	ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
-	if (ret)
-		return ret;
 	assert_qgroups_uptodate(trans);
 	return 0;
 }
@@ -6383,18 +6355,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	/* Deal with the quota accounting */
-	if (!ret && last_ref && !no_quota) {
-		int mod_seq = 0;
-
-		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-		    type == BTRFS_QGROUP_OPER_SUB_SHARED)
-			mod_seq = 1;
-
-		ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
-					      bytenr, num_bytes, type,
-					      mod_seq);
-	}
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -7330,13 +7290,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	/* Always set parent to 0 here since its exclusive anyway. */
-	ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-				      ins->objectid, ins->offset,
-				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-	if (ret)
-		return ret;
-
 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7418,14 +7371,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
-	if (!no_quota) {
-		ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-					      ins->objectid, num_bytes,
-					      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-		if (ret)
-			return ret;
-	}
-
 	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
 				 1);
 	if (ret) { /* -ENOENT, logic error */
@@ -7782,12 +7727,18 @@ reada:
 	wc->reada_slot = slot;
 }
 
+/*
+ * TODO: Modify related function to add related node/leaf to dirty_extent_root,
+ * for later qgroup accounting.
+ *
+ * Current, this function does nothing.
+ */
 static int account_leaf_items(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct extent_buffer *eb)
 {
 	int nr = btrfs_header_nritems(eb);
-	int i, extent_type, ret;
+	int i, extent_type;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	u64 bytenr, num_bytes;
@@ -7810,13 +7761,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
 			continue;
 
 		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
-
-		ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-					      root->objectid,
-					      bytenr, num_bytes,
-					      BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
-		if (ret)
-			return ret;
 	}
 	return 0;
 }
@@ -7885,6 +7829,8 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
+ * TODO: Modify this function to mark all (including complete shared node)
+ * to dirty_extent_root to allow it get accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
@@ -7962,16 +7908,6 @@ walk_down:
 			btrfs_tree_read_lock(eb);
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
-
-			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-						root->objectid,
-						child_bytenr,
-						root->nodesize,
-						BTRFS_QGROUP_OPER_SUB_SUBTREE,
-						0);
-			if (ret)
-				goto out;
-
 		}
 
 		if (level == 0) {
@@ -8566,24 +8502,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 				goto out_end_trans;
 			}
 
-			/*
-			 * Qgroup update accounting is run from
-			 * delayed ref handling. This usually works
-			 * out because delayed refs are normally the
-			 * only way qgroup updates are added. However,
-			 * we may have added updates during our tree
-			 * walk so run qgroups here to make sure we
-			 * don't lose any updates.
-			 */
-			ret = btrfs_delayed_qgroup_accounting(trans,
-							      root->fs_info);
-			if (ret)
-				printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-						   "running qgroup updates "
-						   "during snapshot delete. "
-						   "Quota is out of sync, "
-						   "rescan required.\n", ret);
-
 			btrfs_end_transaction_throttle(trans, tree_root);
 			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
 				pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8637,14 +8555,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	}
 	root_dropped = true;
 out_end_trans:
-	ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
-	if (ret)
-		printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-				   "running qgroup updates "
-				   "during snapshot delete. "
-				   "Quota is out of sync, "
-				   "rescan required.\n", ret);
-
 	btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
 	kfree(wc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3694d57e759f..6f49715cc127 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1967,6 +1967,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		goto scrub_continue;
 	}
 
+	/* Reocrd old roots for later qgroup accounting */
+	ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
 	/*
 	 * make sure none of the code above managed to slip in a
 	 * delayed item
@@ -2008,6 +2015,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
+	/*
+	 * Since fs roots are all committed, we can get a quite accurate
+	 * new_roots. So let's do quota accounting.
+	 */
+	ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+	if (ret < 0) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto scrub_continue;
+	}
+
 	ret = commit_cowonly_roots(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
-- 
cgit v1.2.3-55-g7522


From 442244c9633292a147ab2b29e7007a7c8a3909b2 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Thu, 16 Apr 2015 17:18:36 +0800
Subject: btrfs: qgroup: Switch self test to extent-oriented qgroup mechanism.

Since the self test transaction don't have delayed_ref_roots, so use
find_all_roots() and export btrfs_qgroup_account_extent() to simulate it

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/qgroup.c             |   2 +-
 fs/btrfs/qgroup.h             |   5 ++
 fs/btrfs/tests/qgroup-tests.c | 109 ++++++++++++++++++++++++++++++++----------
 3 files changed, 89 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 70c366f27e25..693533d9e22b 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2455,7 +2455,7 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int
+int
 btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
 			    struct btrfs_fs_info *fs_info,
 			    u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index cb703f859af6..90998b5e1713 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -103,6 +103,11 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 struct btrfs_qgroup_extent_record
 *btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
 				  struct btrfs_qgroup_extent_record *record);
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
+			    u64 bytenr, u64 num_bytes,
+			    struct ulist *old_roots, struct ulist *new_roots);
 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_fs_info *fs_info);
 int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index c32a7ba76bca..846d277b1901 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -21,6 +21,7 @@
 #include "../transaction.h"
 #include "../disk-io.h"
 #include "../qgroup.h"
+#include "../backref.h"
 
 static void init_dummy_trans(struct btrfs_trans_handle *trans)
 {
@@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct ulist *old_roots = NULL;
+	struct ulist *new_roots = NULL;
 	int ret;
 
 	init_dummy_trans(&trans);
@@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 		return ret;
 	}
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
-				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	/*
+	 * Since the test trans doesn't havee the complicated delayed refs,
+	 * we can only call btrfs_qgroup_account_extent() directly to test
+	 * quota.
+	 */
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
 	if (ret) {
-		test_msg("Couldn't add space to a qgroup %d\n", ret);
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
@@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 	if (ret)
 		return ret;
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Delayed qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
@@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 		test_msg("Qgroup counts didn't match expected values\n");
 		return -EINVAL;
 	}
+	old_roots = NULL;
+	new_roots = NULL;
+
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
 
 	ret = remove_extent_item(root, 4096, 4096);
 	if (ret)
 		return -EINVAL;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
-				      BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Couldn't remove space from the qgroup %d\n", ret);
-		return -EINVAL;
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return -EINVAL;
 	}
 
@@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle trans;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct ulist *old_roots = NULL;
+	struct ulist *new_roots = NULL;
 	int ret;
 
 	init_dummy_trans(&trans);
@@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root)
 		return ret;
 	}
 
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
 	ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
 	if (ret)
 		return ret;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
-				      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Couldn't add space to a qgroup %d\n", ret);
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Delayed qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
@@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root)
 		return -EINVAL;
 	}
 
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
 	ret = add_tree_ref(root, 4096, 4096, 0, 256);
 	if (ret)
 		return ret;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
-				      BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Qgroup record ref failed %d\n", ret);
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
@@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root)
 		return -EINVAL;
 	}
 
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots);
+	if (ret) {
+		ulist_free(old_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
+		return ret;
+	}
+
 	ret = remove_extent_ref(root, 4096, 4096, 0, 256);
 	if (ret)
 		return ret;
 
-	ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
-				      BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+	ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots);
 	if (ret) {
-		test_msg("Qgroup record ref failed %d\n", ret);
+		ulist_free(old_roots);
+		ulist_free(new_roots);
+		test_msg("Couldn't find old roots: %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+	ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096,
+					  old_roots, new_roots);
 	if (ret) {
-		test_msg("Qgroup accounting failed %d\n", ret);
+		test_msg("Couldn't account space for a qgroup %d\n", ret);
 		return ret;
 	}
 
-- 
cgit v1.2.3-55-g7522


From e69bcee37692f5d8c557335ddd2444cb4afe0005 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 17 Apr 2015 10:23:16 +0800
Subject: btrfs: qgroup: Cleanup the old ref_node-oriented mechanism.

Goodbye, the old mechanisim.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h             |   2 +-
 fs/btrfs/extent-tree.c       |   5 -
 fs/btrfs/qgroup.c            | 864 +------------------------------------------
 fs/btrfs/qgroup.h            |  49 ---
 include/trace/events/btrfs.h |  55 ---
 5 files changed, 3 insertions(+), 972 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 881549a35fca..0498f5cd8752 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1736,7 +1736,7 @@ struct btrfs_fs_info {
 	/* list of dirty qgroups to be written at next commit */
 	struct list_head dirty_qgroups;
 
-	/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+	/* used by qgroup for an efficient tree traversal */
 	u64 qgroup_seq;
 
 	/* qgroup rescan items */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b76b42d95619..1acd63fcb252 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1981,7 +1981,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	u64 refs;
 	int ret;
 	int no_quota = node->no_quota;
-	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -2009,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, item);
-	if (refs)
-		type = BTRFS_QGROUP_OPER_ADD_SHARED;
 	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
 	if (extent_op)
 		__run_delayed_extent_op(extent_op, leaf, item);
@@ -6112,7 +6109,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	u64 bytenr = node->bytenr;
 	u64 num_bytes = node->num_bytes;
 	int last_ref = 0;
-	enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
 						 SKINNY_METADATA);
 
@@ -6293,7 +6289,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	refs -= refs_to_drop;
 
 	if (refs > 0) {
-		type = BTRFS_QGROUP_OPER_SUB_SHARED;
 		if (extent_op)
 			__run_delayed_extent_op(extent_op, leaf, ei);
 		/*
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 693533d9e22b..c5aa0d34940e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -34,6 +34,7 @@
 #include "extent_io.h"
 #include "qgroup.h"
 
+
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
  *  - reorganize keys
@@ -1387,172 +1388,6 @@ out:
 	return ret;
 }
 
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
-			   struct btrfs_qgroup_operation *oper2)
-{
-	/*
-	 * Ignore seq and type here, we're looking for any operation
-	 * at all related to this extent on that root.
-	 */
-	if (oper1->bytenr < oper2->bytenr)
-		return -1;
-	if (oper1->bytenr > oper2->bytenr)
-		return 1;
-	if (oper1->ref_root < oper2->ref_root)
-		return -1;
-	if (oper1->ref_root > oper2->ref_root)
-		return 1;
-	return 0;
-}
-
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
-			      struct btrfs_qgroup_operation *oper)
-{
-	struct rb_node *n;
-	struct btrfs_qgroup_operation *cur;
-	int cmp;
-
-	spin_lock(&fs_info->qgroup_op_lock);
-	n = fs_info->qgroup_op_tree.rb_node;
-	while (n) {
-		cur = rb_entry(n, struct btrfs_qgroup_operation, n);
-		cmp = comp_oper_exist(cur, oper);
-		if (cmp < 0) {
-			n = n->rb_right;
-		} else if (cmp) {
-			n = n->rb_left;
-		} else {
-			spin_unlock(&fs_info->qgroup_op_lock);
-			return -EEXIST;
-		}
-	}
-	spin_unlock(&fs_info->qgroup_op_lock);
-	return 0;
-}
-
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
-		     struct btrfs_qgroup_operation *oper2)
-{
-	if (oper1->bytenr < oper2->bytenr)
-		return -1;
-	if (oper1->bytenr > oper2->bytenr)
-		return 1;
-	if (oper1->ref_root < oper2->ref_root)
-		return -1;
-	if (oper1->ref_root > oper2->ref_root)
-		return 1;
-	if (oper1->seq < oper2->seq)
-		return -1;
-	if (oper1->seq > oper2->seq)
-		return 1;
-	if (oper1->type < oper2->type)
-		return -1;
-	if (oper1->type > oper2->type)
-		return 1;
-	return 0;
-}
-
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
-			      struct btrfs_qgroup_operation *oper)
-{
-	struct rb_node **p;
-	struct rb_node *parent = NULL;
-	struct btrfs_qgroup_operation *cur;
-	int cmp;
-
-	spin_lock(&fs_info->qgroup_op_lock);
-	p = &fs_info->qgroup_op_tree.rb_node;
-	while (*p) {
-		parent = *p;
-		cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
-		cmp = comp_oper(cur, oper);
-		if (cmp < 0) {
-			p = &(*p)->rb_right;
-		} else if (cmp) {
-			p = &(*p)->rb_left;
-		} else {
-			spin_unlock(&fs_info->qgroup_op_lock);
-			return -EEXIST;
-		}
-	}
-	rb_link_node(&oper->n, parent, p);
-	rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
-	spin_unlock(&fs_info->qgroup_op_lock);
-	return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point.  If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info, u64 ref_root,
-			    u64 bytenr, u64 num_bytes,
-			    enum btrfs_qgroup_operation_type type, int mod_seq)
-{
-	struct btrfs_qgroup_operation *oper;
-	int ret;
-
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		return 0;
-
-	oper = kmalloc(sizeof(*oper), GFP_NOFS);
-	if (!oper)
-		return -ENOMEM;
-
-	oper->ref_root = ref_root;
-	oper->bytenr = bytenr;
-	oper->num_bytes = num_bytes;
-	oper->type = type;
-	oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
-	INIT_LIST_HEAD(&oper->elem.list);
-	oper->elem.seq = 0;
-
-	trace_btrfs_qgroup_record_ref(oper);
-
-	if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
-		/*
-		 * If any operation for this bytenr/ref_root combo
-		 * exists, then we know it's not exclusively owned and
-		 * shouldn't be queued up.
-		 *
-		 * This also catches the case where we have a cloned
-		 * extent that gets queued up multiple times during
-		 * drop snapshot.
-		 */
-		if (qgroup_oper_exists(fs_info, oper)) {
-			kfree(oper);
-			return 0;
-		}
-	}
-
-	ret = insert_qgroup_oper(fs_info, oper);
-	if (ret) {
-		/* Shouldn't happen so have an assert for developers */
-		ASSERT(0);
-		kfree(oper);
-		return ret;
-	}
-	list_add_tail(&oper->list, &trans->qgroup_ref_list);
-
-	if (mod_seq)
-		btrfs_get_tree_mod_seq(fs_info, &oper->elem);
-
-	return 0;
-}
-
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -1606,264 +1441,6 @@ struct btrfs_qgroup_extent_record
 	return NULL;
 }
 
-/*
- * The easy accounting, if we are adding/removing the only ref for an extent
- * then this qgroup and all of the parent qgroups get their refrence and
- * exclusive counts adjusted.
- */
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
-				  struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *tmp;
-	int sign = 0;
-	int ret = 0;
-
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp)
-		return -ENOMEM;
-
-	spin_lock(&fs_info->qgroup_lock);
-	if (!fs_info->quota_root)
-		goto out;
-
-	switch (oper->type) {
-	case BTRFS_QGROUP_OPER_ADD_EXCL:
-		sign = 1;
-		break;
-	case BTRFS_QGROUP_OPER_SUB_EXCL:
-		sign = -1;
-		break;
-	default:
-		ASSERT(0);
-	}
-	ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
-				       oper->num_bytes, sign);
-out:
-	spin_unlock(&fs_info->qgroup_lock);
-	ulist_free(tmp);
-	return ret;
-}
-
-/*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
- */
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
-				  u64 root_to_skip, struct ulist *tmp,
-				  struct ulist *roots, struct ulist *qgroups,
-				  u64 seq, int *old_roots, int rescan)
-{
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct ulist_node *tmp_unode;
-	struct ulist_iterator tmp_uiter;
-	struct btrfs_qgroup *qg;
-	int ret;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(roots, &uiter))) {
-		/* We don't count our current root here */
-		if (unode->val == root_to_skip)
-			continue;
-		qg = find_qgroup_rb(fs_info, unode->val);
-		if (!qg)
-			continue;
-		/*
-		 * We could have a pending removal of this same ref so we may
-		 * not have actually found our ref root when doing
-		 * btrfs_find_all_roots, so we need to keep track of how many
-		 * old roots we find in case we removed ours and added a
-		 * different one at the same time.  I don't think this could
-		 * happen in practice but that sort of thinking leads to pain
-		 * and suffering and to the dark side.
-		 */
-		(*old_roots)++;
-
-		ulist_reinit(tmp);
-		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
-				GFP_ATOMIC);
-		if (ret < 0)
-			return ret;
-		ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
-		if (ret < 0)
-			return ret;
-		ULIST_ITER_INIT(&tmp_uiter);
-		while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
-			struct btrfs_qgroup_list *glist;
-			int mod;
-
-			qg = u64_to_ptr(tmp_unode->aux);
-			/*
-			 * We use this sequence number to keep from having to
-			 * run the whole list and 0 out the refcnt every time.
-			 * We basically use sequnce as the known 0 count and
-			 * then add 1 everytime we see a qgroup.  This is how we
-			 * get how many of the roots actually point up to the
-			 * upper level qgroups in order to determine exclusive
-			 * counts.
-			 *
-			 * For rescan none of the extent is recorded before so
-			 * we just don't add old_refcnt.
-			 */
-			if (rescan)
-				mod = 0;
-			else
-				mod = 1;
-			btrfs_qgroup_update_old_refcnt(qg, seq, mod);
-			btrfs_qgroup_update_new_refcnt(qg, seq, 1);
-			list_for_each_entry(glist, &qg->groups, next_group) {
-				ret = ulist_add(qgroups, glist->group->qgroupid,
-						ptr_to_u64(glist->group),
-						GFP_ATOMIC);
-				if (ret < 0)
-					return ret;
-				ret = ulist_add(tmp, glist->group->qgroupid,
-						ptr_to_u64(glist->group),
-						GFP_ATOMIC);
-				if (ret < 0)
-					return ret;
-			}
-		}
-	}
-	return 0;
-}
-
-/*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
-				       struct btrfs_qgroup_operation *oper,
-				       struct ulist *tmp,
-				       struct ulist *qgroups, u64 seq,
-				       int *old_roots)
-{
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup *qg;
-	struct btrfs_qgroup_operation *tmp_oper;
-	struct rb_node *n;
-	int ret;
-
-	ulist_reinit(tmp);
-
-	/*
-	 * We only walk forward in the tree since we're only interested in
-	 * removals that happened _after_  our operation.
-	 */
-	spin_lock(&fs_info->qgroup_op_lock);
-	n = rb_next(&oper->n);
-	spin_unlock(&fs_info->qgroup_op_lock);
-	if (!n)
-		return 0;
-	tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
-	while (tmp_oper->bytenr == oper->bytenr) {
-		/*
-		 * If it's not a removal we don't care, additions work out
-		 * properly with our refcnt tracking.
-		 */
-		if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
-		    tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
-			goto next;
-		qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
-		if (!qg)
-			goto next;
-		ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
-				GFP_ATOMIC);
-		if (ret) {
-			if (ret < 0)
-				return ret;
-			/*
-			 * We only want to increase old_roots if this qgroup is
-			 * not already in the list of qgroups.  If it is already
-			 * there then that means it must have been re-added or
-			 * the delete will be discarded because we had an
-			 * existing ref that we haven't looked up yet.  In this
-			 * case we don't want to increase old_roots.  So if ret
-			 * == 1 then we know that this is the first time we've
-			 * seen this qgroup and we can bump the old_roots.
-			 */
-			(*old_roots)++;
-			ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
-					GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-		}
-next:
-		spin_lock(&fs_info->qgroup_op_lock);
-		n = rb_next(&tmp_oper->n);
-		spin_unlock(&fs_info->qgroup_op_lock);
-		if (!n)
-			break;
-		tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
-	}
-
-	/* Ok now process the qgroups we found */
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		struct btrfs_qgroup_list *glist;
-
-		qg = u64_to_ptr(unode->aux);
-		btrfs_qgroup_update_old_refcnt(qg, seq, 1);
-		btrfs_qgroup_update_new_refcnt(qg, seq, 1);
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(qgroups, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-			ret = ulist_add(tmp, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
-				  struct btrfs_qgroup_operation *oper,
-				  struct btrfs_qgroup *qgroup,
-				  struct ulist *tmp, struct ulist *qgroups,
-				  u64 seq)
-{
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup *qg;
-	int ret;
-
-	ulist_reinit(tmp);
-	ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
-			GFP_ATOMIC);
-	if (ret < 0)
-		return ret;
-	ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
-			GFP_ATOMIC);
-	if (ret < 0)
-		return ret;
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(tmp, &uiter))) {
-		struct btrfs_qgroup_list *glist;
-
-		qg = u64_to_ptr(unode->aux);
-		if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED)
-			btrfs_qgroup_update_new_refcnt(qg, seq, 1);
-		else
-			btrfs_qgroup_update_old_refcnt(qg, seq, 1);
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			ret = ulist_add(tmp, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-			ret = ulist_add(qgroups, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (ret < 0)
-				return ret;
-		}
-	}
-	return 0;
-}
-
 #define UPDATE_NEW	0
 #define UPDATE_OLD	1
 /*
@@ -1925,6 +1502,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
 /*
  * Update qgroup rfer/excl counters.
  * Rfer update is easy, codes can explain themselves.
+ *
  * Excl update is tricky, the update is split into 2 part.
  * Part 1: Possible exclusive <-> sharing detect:
  *	|	A	|	!A	|
@@ -2042,419 +1620,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-/*
- * This adjusts the counters for all referenced qgroups if need be.
- */
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
-				  u64 root_to_skip, u64 num_bytes,
-				  struct ulist *qgroups, u64 seq,
-				  int old_roots, int new_roots, int rescan)
-{
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup *qg;
-	u64 cur_new_count, cur_old_count;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(qgroups, &uiter))) {
-		bool dirty = false;
-
-		qg = u64_to_ptr(unode->aux);
-		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
-		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
-
-		/*
-		 * Wasn't referenced before but is now, add to the reference
-		 * counters.
-		 */
-		if (cur_old_count == 0 && cur_new_count > 0) {
-			qg->rfer += num_bytes;
-			qg->rfer_cmpr += num_bytes;
-			dirty = true;
-		}
-
-		/*
-		 * Was referenced before but isn't now, subtract from the
-		 * reference counters.
-		 */
-		if (cur_old_count > 0 && cur_new_count == 0) {
-			qg->rfer -= num_bytes;
-			qg->rfer_cmpr -= num_bytes;
-			dirty = true;
-		}
-
-		/*
-		 * If our refcount was the same as the roots previously but our
-		 * new count isn't the same as the number of roots now then we
-		 * went from having a exclusive reference on this range to not.
-		 */
-		if (old_roots && cur_old_count == old_roots &&
-		    (cur_new_count != new_roots || new_roots == 0)) {
-			WARN_ON(cur_new_count != new_roots && new_roots == 0);
-			qg->excl -= num_bytes;
-			qg->excl_cmpr -= num_bytes;
-			dirty = true;
-		}
-
-		/*
-		 * If we didn't reference all the roots before but now we do we
-		 * have an exclusive reference to this range.
-		 */
-		if ((!old_roots || (old_roots && cur_old_count != old_roots))
-		    && cur_new_count == new_roots) {
-			qg->excl += num_bytes;
-			qg->excl_cmpr += num_bytes;
-			dirty = true;
-		}
-
-		if (dirty)
-			qgroup_dirty(fs_info, qg);
-	}
-	return 0;
-}
-
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr.  If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
-			       struct btrfs_fs_info *fs_info,
-			       struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *roots = NULL;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	int ret = 0;
-
-	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
-				   oper->elem.seq, &roots);
-	if (ret < 0)
-		return ret;
-	ret = 0;
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(roots, &uiter))) {
-		if (unode->val == oper->ref_root) {
-			ret = 1;
-			break;
-		}
-	}
-	ulist_free(roots);
-	btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
-	return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters.  The basic premise is this
- *
- * 1) We have seq to represent a 0 count.  Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count.  This means that if
- * anybody is equal to or below this sequence they were never referenced.  We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently.  This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently.  We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- *			[qgroup 1/0]
- *		     /         |          \
- *		[qg 0/0]   [qg 0/1]	[qg 0/2]
- *		   \          |            /
- *		  [	   extent	    ]
- *
- * Say we are adding a reference that is covered by qg 0/0.  The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes.  Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info,
-				    struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *roots = NULL;
-	struct ulist *qgroups, *tmp;
-	struct btrfs_qgroup *qgroup;
-	struct seq_list elem = SEQ_LIST_INIT(elem);
-	u64 seq;
-	int old_roots = 0;
-	int new_roots = 0;
-	int ret = 0;
-
-	if (oper->elem.seq) {
-		ret = check_existing_refs(trans, fs_info, oper);
-		if (ret < 0)
-			return ret;
-		if (ret)
-			return 0;
-	}
-
-	qgroups = ulist_alloc(GFP_NOFS);
-	if (!qgroups)
-		return -ENOMEM;
-
-	tmp = ulist_alloc(GFP_NOFS);
-	if (!tmp) {
-		ulist_free(qgroups);
-		return -ENOMEM;
-	}
-
-	btrfs_get_tree_mod_seq(fs_info, &elem);
-	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
-				   &roots);
-	btrfs_put_tree_mod_seq(fs_info, &elem);
-	if (ret < 0) {
-		ulist_free(qgroups);
-		ulist_free(tmp);
-		return ret;
-	}
-	spin_lock(&fs_info->qgroup_lock);
-	qgroup = find_qgroup_rb(fs_info, oper->ref_root);
-	if (!qgroup)
-		goto out;
-	seq = fs_info->qgroup_seq;
-
-	/*
-	 * So roots is the list of all the roots currently pointing at the
-	 * bytenr, including the ref we are adding if we are adding, or not if
-	 * we are removing a ref.  So we pass in the ref_root to skip that root
-	 * in our calculations.  We set old_refnct and new_refcnt cause who the
-	 * hell knows what everything looked like before, and it doesn't matter
-	 * except...
-	 */
-	ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
-				     seq, &old_roots, 0);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * Now adjust the refcounts of the qgroups that care about this
-	 * reference, either the old_count in the case of removal or new_count
-	 * in the case of an addition.
-	 */
-	ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
-				     seq);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * ...in the case of removals.  If we had a removal before we got around
-	 * to processing this operation then we need to find that guy and count
-	 * his references as if they really existed so we don't end up screwing
-	 * up the exclusive counts.  Then whenever we go to process the delete
-	 * everything will be grand and we can account for whatever exclusive
-	 * changes need to be made there.  We also have to pass in old_roots so
-	 * we have an accurate count of the roots as it pertains to this
-	 * operations view of the world.
-	 */
-	ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
-					  &old_roots);
-	if (ret < 0)
-		goto out;
-
-	/*
-	 * We are adding our root, need to adjust up the number of roots,
-	 * otherwise old_roots is the number of roots we want.
-	 */
-	if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
-		new_roots = old_roots + 1;
-	} else {
-		new_roots = old_roots;
-		old_roots++;
-	}
-
-	/*
-	 * Bump qgroup_seq to avoid seq overlap
-	 * XXX: This makes qgroup_seq mismatch with oper->seq.
-	 */
-	fs_info->qgroup_seq += old_roots + 1;
-
-
-	/*
-	 * And now the magic happens, bless Arne for having a pretty elegant
-	 * solution for this.
-	 */
-	qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
-			       qgroups, seq, old_roots, new_roots, 0);
-out:
-	spin_unlock(&fs_info->qgroup_lock);
-	ulist_free(qgroups);
-	ulist_free(roots);
-	ulist_free(tmp);
-	return ret;
-}
-
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
-				     struct btrfs_fs_info *fs_info,
-				     struct btrfs_qgroup_operation *oper)
-{
-	struct ulist *roots = NULL;
-	struct ulist_node *unode;
-	struct ulist_iterator uiter;
-	struct btrfs_qgroup_list *glist;
-	struct ulist *parents;
-	int ret = 0;
-	int err;
-	struct btrfs_qgroup *qg;
-	u64 root_obj = 0;
-	struct seq_list elem = SEQ_LIST_INIT(elem);
-
-	parents = ulist_alloc(GFP_NOFS);
-	if (!parents)
-		return -ENOMEM;
-
-	btrfs_get_tree_mod_seq(fs_info, &elem);
-	ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
-				   elem.seq, &roots);
-	btrfs_put_tree_mod_seq(fs_info, &elem);
-	if (ret < 0)
-		goto out;
-
-	if (roots->nnodes != 1)
-		goto out;
-
-	ULIST_ITER_INIT(&uiter);
-	unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
-	/*
-	 * If we find our ref root then that means all refs
-	 * this extent has to the root have not yet been
-	 * deleted. In that case, we do nothing and let the
-	 * last ref for this bytenr drive our update.
-	 *
-	 * This can happen for example if an extent is
-	 * referenced multiple times in a snapshot (clone,
-	 * etc). If we are in the middle of snapshot removal,
-	 * queued updates for such an extent will find the
-	 * root if we have not yet finished removing the
-	 * snapshot.
-	 */
-	if (unode->val == oper->ref_root)
-		goto out;
-
-	root_obj = unode->val;
-	BUG_ON(!root_obj);
-
-	spin_lock(&fs_info->qgroup_lock);
-	qg = find_qgroup_rb(fs_info, root_obj);
-	if (!qg)
-		goto out_unlock;
-
-	qg->excl += oper->num_bytes;
-	qg->excl_cmpr += oper->num_bytes;
-	qgroup_dirty(fs_info, qg);
-
-	/*
-	 * Adjust counts for parent groups. First we find all
-	 * parents, then in the 2nd loop we do the adjustment
-	 * while adding parents of the parents to our ulist.
-	 */
-	list_for_each_entry(glist, &qg->groups, next_group) {
-		err = ulist_add(parents, glist->group->qgroupid,
-				ptr_to_u64(glist->group), GFP_ATOMIC);
-		if (err < 0) {
-			ret = err;
-			goto out_unlock;
-		}
-	}
-
-	ULIST_ITER_INIT(&uiter);
-	while ((unode = ulist_next(parents, &uiter))) {
-		qg = u64_to_ptr(unode->aux);
-		qg->excl += oper->num_bytes;
-		qg->excl_cmpr += oper->num_bytes;
-		qgroup_dirty(fs_info, qg);
-
-		/* Add any parents of the parents */
-		list_for_each_entry(glist, &qg->groups, next_group) {
-			err = ulist_add(parents, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
-			if (err < 0) {
-				ret = err;
-				goto out_unlock;
-			}
-		}
-	}
-
-out_unlock:
-	spin_unlock(&fs_info->qgroup_lock);
-
-out:
-	ulist_free(roots);
-	ulist_free(parents);
-	return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
-				struct btrfs_fs_info *fs_info,
-				struct btrfs_qgroup_operation *oper)
-{
-	int ret = 0;
-
-	if (!fs_info->quota_enabled)
-		return 0;
-
-	BUG_ON(!fs_info->quota_root);
-
-	mutex_lock(&fs_info->qgroup_rescan_lock);
-	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-		if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
-			mutex_unlock(&fs_info->qgroup_rescan_lock);
-			return 0;
-		}
-	}
-	mutex_unlock(&fs_info->qgroup_rescan_lock);
-
-	ASSERT(is_fstree(oper->ref_root));
-
-	trace_btrfs_qgroup_account(oper);
-
-	switch (oper->type) {
-	case BTRFS_QGROUP_OPER_ADD_EXCL:
-	case BTRFS_QGROUP_OPER_SUB_EXCL:
-		ret = qgroup_excl_accounting(fs_info, oper);
-		break;
-	case BTRFS_QGROUP_OPER_ADD_SHARED:
-	case BTRFS_QGROUP_OPER_SUB_SHARED:
-		ret = qgroup_shared_accounting(trans, fs_info, oper);
-		break;
-	case BTRFS_QGROUP_OPER_SUB_SUBTREE:
-		ret = qgroup_subtree_accounting(trans, fs_info, oper);
-		break;
-	default:
-		ASSERT(0);
-	}
-	return ret;
-}
-
 int
 btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
 			    struct btrfs_fs_info *fs_info,
@@ -2571,31 +1736,6 @@ cleanup:
 	return ret;
 }
 
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info)
-{
-	struct btrfs_qgroup_operation *oper;
-	int ret = 0;
-
-	while (!list_empty(&trans->qgroup_ref_list)) {
-		oper = list_first_entry(&trans->qgroup_ref_list,
-					struct btrfs_qgroup_operation, list);
-		list_del_init(&oper->list);
-		if (!ret || !trans->aborted)
-			ret = btrfs_qgroup_account(trans, fs_info, oper);
-		spin_lock(&fs_info->qgroup_op_lock);
-		rb_erase(&oper->n, &fs_info->qgroup_op_tree);
-		spin_unlock(&fs_info->qgroup_op_lock);
-		btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-		kfree(oper);
-	}
-	return ret;
-}
-
 /*
  * called from commit_transaction. Writes all changed qgroups to disk.
  */
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 90998b5e1713..6387dcfa354c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -22,45 +22,6 @@
 #include "ulist.h"
 #include "delayed-ref.h"
 
-/*
- * A description of the operations, all of these operations only happen when we
- * are adding the 1st reference for that subvolume in the case of adding space
- * or on the last reference delete in the case of subtraction.  The only
- * exception is the last one, which is added for confusion.
- *
- * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
- * one pointing at the bytes we are adding.  This is called on the first
- * allocation.
- *
- * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
- * shared between subvols.  This is called on the creation of a ref that already
- * has refs from a different subvolume, so basically reflink.
- *
- * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
- * one referencing the range.
- *
- * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
- * refs with other subvolumes.
- */
-enum btrfs_qgroup_operation_type {
-	BTRFS_QGROUP_OPER_ADD_EXCL,
-	BTRFS_QGROUP_OPER_ADD_SHARED,
-	BTRFS_QGROUP_OPER_SUB_EXCL,
-	BTRFS_QGROUP_OPER_SUB_SHARED,
-	BTRFS_QGROUP_OPER_SUB_SUBTREE,
-};
-
-struct btrfs_qgroup_operation {
-	u64 ref_root;
-	u64 bytenr;
-	u64 num_bytes;
-	u64 seq;
-	enum btrfs_qgroup_operation_type type;
-	struct seq_list elem;
-	struct rb_node n;
-	struct list_head list;
-};
-
 /*
  * Record a dirty extent, and info qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.
@@ -93,11 +54,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
 struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_fs_info *fs_info, u64 ref_root,
-			    u64 bytenr, u64 num_bytes,
-			    enum btrfs_qgroup_operation_type type,
-			    int mod_seq);
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 struct btrfs_qgroup_extent_record
@@ -110,11 +66,6 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
 			    struct ulist *old_roots, struct ulist *new_roots);
 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_fs_info *fs_info);
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
-				    struct btrfs_fs_info *fs_info);
-void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
-				   struct btrfs_fs_info *fs_info,
-				   struct btrfs_qgroup_operation *oper);
 int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
 		      struct btrfs_fs_info *fs_info);
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 7f79cf459591..0b73af9be12f 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1117,61 +1117,6 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
 	TP_ARGS(wq)
 );
 
-#define show_oper_type(type)						\
-	__print_symbolic(type,						\
-		{ BTRFS_QGROUP_OPER_ADD_EXCL, 	"OPER_ADD_EXCL" },	\
-		{ BTRFS_QGROUP_OPER_ADD_SHARED, "OPER_ADD_SHARED" },	\
-		{ BTRFS_QGROUP_OPER_SUB_EXCL, 	"OPER_SUB_EXCL" },	\
-		{ BTRFS_QGROUP_OPER_SUB_SHARED,	"OPER_SUB_SHARED" })
-
-DECLARE_EVENT_CLASS(btrfs_qgroup_oper,
-
-	TP_PROTO(struct btrfs_qgroup_operation *oper),
-
-	TP_ARGS(oper),
-
-	TP_STRUCT__entry(
-		__field(	u64,  ref_root		)
-		__field(	u64,  bytenr		)
-		__field(	u64,  num_bytes		)
-		__field(	u64,  seq		)
-		__field(	int,  type		)
-		__field(	u64,  elem_seq		)
-	),
-
-	TP_fast_assign(
-		__entry->ref_root	= oper->ref_root;
-		__entry->bytenr		= oper->bytenr,
-		__entry->num_bytes	= oper->num_bytes;
-		__entry->seq 		= oper->seq;
-		__entry->type		= oper->type;
-		__entry->elem_seq	= oper->elem.seq;
-	),
-
-	TP_printk("ref_root = %llu, bytenr = %llu, num_bytes = %llu, "
-		  "seq = %llu, elem.seq = %llu, type = %s",
-		  (unsigned long long)__entry->ref_root,
-		  (unsigned long long)__entry->bytenr,
-		  (unsigned long long)__entry->num_bytes,
-		  (unsigned long long)__entry->seq,
-		  (unsigned long long)__entry->elem_seq,
-		  show_oper_type(__entry->type))
-);
-
-DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_account,
-
-	TP_PROTO(struct btrfs_qgroup_operation *oper),
-
-	TP_ARGS(oper)
-);
-
-DEFINE_EVENT(btrfs_qgroup_oper, btrfs_qgroup_record_ref,
-
-	TP_PROTO(struct btrfs_qgroup_operation *oper),
-
-	TP_ARGS(oper)
-);
-
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3-55-g7522


From d4b804045924d7f8d2ea988be22c4b9e6492ec11 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 20 Apr 2015 09:26:02 +0800
Subject: btrfs: ulist: Add ulist_del() function.

This function will delete unode with given (val,aux) pair.
And with this patch, seqnum for debug usage doesn't have any meaning
now, so remove them.

This is used by later patches to skip snapshot root.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ulist.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/ulist.h |  1 +
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index 840a38b2778a..91feb2bdefee 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
 	return NULL;
 }
 
+static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
+{
+	rb_erase(&node->rb_node, &ulist->root);
+	list_del(&node->list);
+	kfree(node);
+	BUG_ON(ulist->nnodes == 0);
+	ulist->nnodes--;
+}
+
 static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
 {
 	struct rb_node **p = &ulist->root.rb_node;
@@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 
 	node->val = val;
 	node->aux = aux;
-#ifdef CONFIG_BTRFS_DEBUG
-	node->seqnum = ulist->nnodes;
-#endif
 
 	ret = ulist_rbtree_insert(ulist, node);
 	ASSERT(!ret);
@@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 	return 1;
 }
 
+/*
+ * ulist_del - delete one node from ulist
+ * @ulist:	ulist to remove node from
+ * @val:	value to delete
+ * @aux:	aux to delete
+ *
+ * The deletion will only be done when *BOTH* val and aux matches.
+ * Return 0 for successful delete.
+ * Return > 0 for not found.
+ */
+int ulist_del(struct ulist *ulist, u64 val, u64 aux)
+{
+	struct ulist_node *node;
+
+	node = ulist_rbtree_search(ulist, val);
+	/* Not found */
+	if (!node)
+		return 1;
+
+	if (node->aux != aux)
+		return 1;
+
+	/* Found and delete */
+	ulist_rbtree_erase(ulist, node);
+	return 0;
+}
+
 /**
  * ulist_next - iterate ulist
  * @ulist:	ulist to iterate
@@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
 		uiter->cur_list = uiter->cur_list->next;
 	} else {
 		uiter->cur_list = ulist->nodes.next;
-#ifdef CONFIG_BTRFS_DEBUG
-		uiter->i = 0;
-#endif
 	}
 	node = list_entry(uiter->cur_list, struct ulist_node, list);
-#ifdef CONFIG_BTRFS_DEBUG
-	ASSERT(node->seqnum == uiter->i);
-	ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
-	uiter->i++;
-#endif
 	return node;
 }
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 4c29db604bbe..a01a2c45825f 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist);
 int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
 int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
 		    u64 *old_aux, gfp_t gfp_mask);
+int ulist_del(struct ulist *ulist, u64 val, u64 aux);
 
 /* just like ulist_add_merge() but take a pointer for the aux data */
 static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
-- 
cgit v1.2.3-55-g7522


From 9086db86e0b09c39abead4d747119695553e3978 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 20 Apr 2015 09:53:50 +0800
Subject: btrfs: qgroup: Add the ability to skip given qgroup for
 old/new_roots.

This is used by later qgroup fix patches for snapshot.

As current snapshot accounting is done by btrfs_qgroup_inherit(), but
new extent oriented quota mechanism will account extent from
btrfs_copy_root() and other snapshot things, causing wrong result.

So add this ability to handle snapshot accounting.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/delayed-ref.h |  8 ++++++++
 fs/btrfs/qgroup.c      |  8 ++++++++
 fs/btrfs/transaction.c |  1 +
 fs/btrfs/transaction.h | 23 +++++++++++++++++++++++
 4 files changed, 40 insertions(+)

diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 4016f963599e..13fb5e6090fe 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -175,6 +175,14 @@ struct btrfs_delayed_ref_root {
 	int flushing;
 
 	u64 run_delayed_start;
+
+	/*
+	 * To make qgroup to skip given root.
+	 * This is for snapshot, as btrfs_qgroup_inherit() will manully
+	 * modify counters for snapshot and its source, so we should skip
+	 * the snapshot in new_root/old_roots or it will get calculated twice
+	 */
+	u64 qgroup_to_skip;
 };
 
 extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c5aa0d34940e..d5f1f033b7a0 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1394,9 +1394,11 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 	struct btrfs_qgroup_extent_record *record;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct rb_node *node;
+	u64 qgroup_to_skip;
 	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
+	qgroup_to_skip = delayed_refs->qgroup_to_skip;
 
 	/*
 	 * No need to do lock, since this function will only be called in
@@ -1410,6 +1412,8 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 					   &record->old_roots);
 		if (ret < 0)
 			break;
+		if (qgroup_to_skip)
+			ulist_del(record->old_roots, qgroup_to_skip, 0);
 		node = rb_next(node);
 	}
 	return ret;
@@ -1702,9 +1706,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct ulist *new_roots = NULL;
 	struct rb_node *node;
+	u64 qgroup_to_skip;
 	int ret = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
+	qgroup_to_skip = delayed_refs->qgroup_to_skip;
 	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
 		record = rb_entry(node, struct btrfs_qgroup_extent_record,
 				  node);
@@ -1719,6 +1725,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
 					record->bytenr, (u64)-1, &new_roots);
 			if (ret < 0)
 				goto cleanup;
+			if (qgroup_to_skip)
+				ulist_del(new_roots, qgroup_to_skip, 0);
 			ret = btrfs_qgroup_account_extent(trans, fs_info,
 					record->bytenr, record->num_bytes,
 					record->old_roots, new_roots);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6f49715cc127..3e3793dcb4c2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -232,6 +232,7 @@ loop:
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
+	cur_trans->delayed_refs.qgroup_to_skip = 0;
 
 	/*
 	 * although the tree mod log is per file system and not per transaction,
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 036fa83d6ccb..eb09c2067fa8 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -154,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 	spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+/*
+ * Make qgroup codes to skip given qgroupid, means the old/new_roots for
+ * qgroup won't contain the qgroupid in it.
+ */
+static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans,
+					 u64 qgroupid)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	WARN_ON(delayed_refs->qgroup_to_skip);
+	delayed_refs->qgroup_to_skip = qgroupid;
+}
+
+static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	WARN_ON(!delayed_refs->qgroup_to_skip);
+	delayed_refs->qgroup_to_skip = 0;
+}
+
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-- 
cgit v1.2.3-55-g7522


From d67263354541982a29e22a327a9d8c71d1099766 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 20 Apr 2015 10:09:06 +0800
Subject: btrfs: qgroup: Make snapshot accounting work with new extent-oriented
 qgroup.

Make snapshot accounting work with new extent-oriented mechanism by
skipping given root in new/old_roots in create_pending_snapshot().

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/transaction.c | 53 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3e3793dcb4c2..c0f18e7266b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1295,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (pending->error)
 		goto no_free_objectid;
 
+	/*
+	 * Make qgroup to skip current new snapshot's qgroupid, as it is
+	 * accounted by later btrfs_qgroup_inherit().
+	 */
+	btrfs_set_skip_qgroup(trans, objectid);
+
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
@@ -1303,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 						     to_reserve,
 						     BTRFS_RESERVE_NO_FLUSH);
 		if (pending->error)
-			goto no_free_objectid;
+			goto clear_skip_qgroup;
 	}
 
 	key.objectid = objectid;
@@ -1401,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
 	}
-
-	/*
-	 * We need to flush delayed refs in order to make sure all of our quota
-	 * operations have been done before we call btrfs_qgroup_inherit.
-	 */
-	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto fail;
-	}
-
-	ret = btrfs_qgroup_inherit(trans, fs_info,
-				   root->root_key.objectid,
-				   objectid, pending->inherit);
-	if (ret) {
-		btrfs_abort_transaction(trans, root, ret);
-		goto fail;
-	}
-
 	/* see comments in should_cow_block() */
 	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
 	smp_wmb();
@@ -1502,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 			goto fail;
 		}
 	}
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	/*
+	 * account qgroup counters before qgroup_inherit()
+	 */
+	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+	if (ret)
+		goto fail;
+	ret = btrfs_qgroup_account_extents(trans, fs_info);
+	if (ret)
+		goto fail;
+	ret = btrfs_qgroup_inherit(trans, fs_info,
+				   root->root_key.objectid,
+				   objectid, pending->inherit);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
 fail:
 	pending->error = ret;
 dir_item_existed:
 	trans->block_rsv = rsv;
 	trans->bytes_reserved = 0;
+clear_skip_qgroup:
+	btrfs_clear_skip_qgroup(trans);
 no_free_objectid:
 	kfree(new_root_item);
 root_item_alloc_fail:
-- 
cgit v1.2.3-55-g7522


From 9a4e7276d39071576d369e607d7accb84b41d0b4 Mon Sep 17 00:00:00 2001
From: Zhao Lei
Date: Thu, 9 Apr 2015 12:34:43 +0800
Subject: btrfs: wait for delayed iputs on no space

btrfs will report no_space when we run following write and delete
file loop:
 # FILE_SIZE_M=[ 75% of fs space ]
 # DEV=[ some dev ]
 # MNT=[ some dir ]
 #
 # mkfs.btrfs -f "$DEV"
 # mount -o nodatacow "$DEV" "$MNT"
 # for ((i = 0; i < 100; i++)); do dd if=/dev/zero of="$MNT"/file0 bs=1M count="$FILE_SIZE_M"; rm -f "$MNT"/file0; done
 #

Reason:
 iput() and evict() is run after write pages to block device, if
 write pages work is not finished before next write, the "rm"ed space
 is not freed, and caused above bug.

Fix:
 We can add "-o flushoncommit" mount option to avoid above bug, but
 it have performance problem. Actually, we can to wait for on-the-fly
 writes only when no-space happened, it is which this patch do.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1acd63fcb252..38b76cc02f48 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3918,6 +3918,9 @@ commit_trans:
 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
 			need_commit--;
 
+			if (need_commit > 0)
+				btrfs_wait_ordered_roots(fs_info, -1);
+
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
-- 
cgit v1.2.3-55-g7522


From 0eeff2362b829b5e80f8b69f86b60b8094bc742d Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Thu, 11 Jun 2015 14:16:44 +0800
Subject: Btrfs: fix use-after-free in btrfs_replay_log

@log_root_tree should not be referenced after kfree.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Reported-by: Julia Lawall <julia.lawall@lip6.fr>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 695363ae1c28..b7fa3bac2cff 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2321,8 +2321,9 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
 			fs_info->generation + 1);
 	if (IS_ERR(log_tree_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read log tree\n");
+		ret = PTR_ERR(log_tree_root->node);
 		kfree(log_tree_root);
-		return PTR_ERR(log_tree_root->node);
+		return ret;
 	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
 		printk(KERN_ERR "BTRFS: failed to read log tree\n");
 		free_extent_buffer(log_tree_root->node);
-- 
cgit v1.2.3-55-g7522


From 37b8d27de5d0079e1ecef2711061048e13054ebe Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 4 Jun 2015 17:17:25 -0400
Subject: Btrfs: use received_uuid of parent during send

Neil Horman pointed out a problem where if he did something like this

receive A
snap A B
change B
send -p A B

and then on another box do

recieve A
receive B

the receive B would fail because we use the UUID of A for the clone sources for
B.  This makes sense most of the time because normally you are sending from the
original sources, not a received source.  However when you use a recieved subvol
its UUID is going to be something completely different, so if you then try to
receive the diff on a different volume it won't find the UUID because the new A
will be something else.  The only constant is the received uuid.  So instead
check to see if we have received_uuid set on the root, and if so use that as the
clone source, as btrfs receive looks for matches either in received_uuid or
uuid.  Thanks,

Reported-by: Neil Horman <nhorman@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Hugo Mills <hugo@carfax.org.uk>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/send.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 50ebc622a324..aa72bfd28f7d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2356,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
 		    le64_to_cpu(sctx->send_root->root_item.ctransid));
 	if (parent_root) {
-		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-				sctx->parent_root->root_item.uuid);
+		if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
+			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+				     parent_root->root_item.received_uuid);
+		else
+			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+				     parent_root->root_item.uuid);
 		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
 			    le64_to_cpu(sctx->parent_root->root_item.ctransid));
 	}
@@ -4586,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
 	if (ret < 0)
 		goto out;
 
-	TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
-			clone_root->root->root_item.uuid);
+	/*
+	 * If the parent we're using has a received_uuid set then use that as
+	 * our clone source as that is what we will look for when doing a
+	 * receive.
+	 *
+	 * This covers the case that we create a snapshot off of a received
+	 * subvolume and then use that as the parent and try to receive on a
+	 * different host.
+	 */
+	if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+			     clone_root->root->root_item.received_uuid);
+	else
+		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
+			     clone_root->root->root_item.uuid);
 	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
 		    le64_to_cpu(clone_root->root->root_item.ctransid));
 	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
-- 
cgit v1.2.3-55-g7522


From 4fde46f0cc71c7aba299ee6dfb4f017fb97b6e70 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Wed, 17 Jun 2015 21:10:48 +0800
Subject: Btrfs: free the stale device

When btrfs on a device is overwritten with a new btrfs (mkfs),
the old btrfs instance in the kernel becomes stale. So with this
patch, if kernel finds device is overwritten then delete the stale
fsid/uuid.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
---
 fs/btrfs/volumes.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b851964b7345..ea293db89cc4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -445,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work)
 	run_scheduled_bios(device);
 }
 
+
+void btrfs_free_stale_device(struct btrfs_device *cur_dev)
+{
+	struct btrfs_fs_devices *fs_devs;
+	struct btrfs_device *dev;
+
+	if (!cur_dev->name)
+		return;
+
+	list_for_each_entry(fs_devs, &fs_uuids, list) {
+		int del = 1;
+
+		if (fs_devs->opened)
+			continue;
+		if (fs_devs->seeding)
+			continue;
+
+		list_for_each_entry(dev, &fs_devs->devices, dev_list) {
+
+			if (dev == cur_dev)
+				continue;
+			if (!dev->name)
+				continue;
+
+			/*
+			 * Todo: This won't be enough. What if the same device
+			 * comes back (with new uuid and) with its mapper path?
+			 * But for now, this does help as mostly an admin will
+			 * either use mapper or non mapper path throughout.
+			 */
+			rcu_read_lock();
+			del = strcmp(rcu_str_deref(dev->name),
+						rcu_str_deref(cur_dev->name));
+			rcu_read_unlock();
+			if (!del)
+				break;
+		}
+
+		if (!del) {
+			/* delete the stale device */
+			if (fs_devs->num_devices == 1) {
+				btrfs_sysfs_remove_fsid(fs_devs);
+				list_del(&fs_devs->list);
+				free_fs_devices(fs_devs);
+			} else {
+				fs_devs->num_devices--;
+				list_del(&dev->dev_list);
+				rcu_string_free(dev->name);
+				kfree(dev);
+			}
+			break;
+		}
+	}
+}
+
 /*
  * Add new device to list of registered devices
  *
@@ -560,6 +615,12 @@ static noinline int device_list_add(const char *path,
 	if (!fs_devices->opened)
 		device->generation = found_transid;
 
+	/*
+	 * if there is new btrfs on an already registered device,
+	 * then remove the stale device entry.
+	 */
+	btrfs_free_stale_device(device);
+
 	*fs_devices_ret = fs_devices;
 
 	return ret;
-- 
cgit v1.2.3-55-g7522


From d2ff1b2008cb807102fc9496fadd8eddff98350c Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Tue, 10 Mar 2015 06:38:42 +0800
Subject: Btrfs: sysfs: add support to show replacing target in the sysfs

This patch will add support to show the replacing target in sysfs
during the process of replacement.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/dev-replace.c | 5 ++++-
 fs/btrfs/volumes.c     | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f982ef303f1c..862fbc206755 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	WARN_ON(!tgt_device);
 	dev_replace->tgtdev = tgt_device;
 
+	ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
+	if (ret)
+		btrfs_error(root->fs_info, ret, "kobj add dev failed");
+
 	printk_in_rcu(KERN_INFO
 		      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
 		      src_device->missing ? "<missing disk>" :
@@ -584,7 +588,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
 	/* replace the sysfs entry */
 	btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
-	btrfs_kobj_add_device(fs_info->fs_devices, tgt_device);
 	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
 	/* write back the superblocks */
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ea293db89cc4..2d9061f078ca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1940,6 +1940,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	mutex_lock(&uuid_mutex);
 	WARN_ON(!tgtdev);
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+
+	btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+
 	if (tgtdev->bdev) {
 		btrfs_scratch_superblock(tgtdev);
 		fs_info->fs_devices->open_devices--;
-- 
cgit v1.2.3-55-g7522


From 24199d206a739a77edc3ff6c429b68b7bd6b2ef8 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Thu, 12 Feb 2015 07:03:37 +0800
Subject: lib: export symbol kobject_move()

drivers/cpufreq/cpufreq.c is already using this function. And now btrfs
needs it as well. Export symbol kobject_move().

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 lib/kobject.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/kobject.c b/lib/kobject.c
index 3b841b97fccd..53bfc853cd8c 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -548,6 +548,7 @@ out:
 	kfree(devpath);
 	return error;
 }
+EXPORT_SYMBOL_GPL(kobject_move);
 
 /**
  * kobject_del - unlink kobject from hierarchy.
-- 
cgit v1.2.3-55-g7522


From f90fc5472882ee7e76e0f345b9642b92c8677582 Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Mon, 22 Jun 2015 18:18:32 +0800
Subject: Btrfs: Check if kobject is initialized before put

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Tested-by: David Sterba <dsterba@suse.cz>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/sysfs.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index ea81a057c79b..603b0cc2b9bb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -523,9 +523,11 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
 		fs_devs->device_dir_kobj = NULL;
 	}
 
-	kobject_del(&fs_devs->super_kobj);
-	kobject_put(&fs_devs->super_kobj);
-	wait_for_completion(&fs_devs->kobj_unregister);
+	if (fs_devs->super_kobj.state_initialized) {
+		kobject_del(&fs_devs->super_kobj);
+		kobject_put(&fs_devs->super_kobj);
+		wait_for_completion(&fs_devs->kobj_unregister);
+	}
 }
 
 /* when fs_devs is NULL it will remove all fsid kobject */
-- 
cgit v1.2.3-55-g7522


From 5a5003df98d5a7f6834227885b7c9728f767cc27 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 24 Jun 2015 17:32:33 +0300
Subject: btrfs: delayed-ref: double free in btrfs_add_delayed_tree_ref()

There is a cut and paste error so instead of freeing "head_ref", we free
"ref" twice.

Fixes: 3368d001ba5d ('btrfs: qgroup: Record possible quota-related extent for qgroup.')
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/delayed-ref.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index fd64fd0f011a..ac3e81da6d4e 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -650,18 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		return -ENOMEM;
 
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
-	if (!head_ref) {
-		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
-		return -ENOMEM;
-	}
+	if (!head_ref)
+		goto free_ref;
 
 	if (fs_info->quota_enabled && is_fstree(ref_root)) {
 		record = kmalloc(sizeof(*record), GFP_NOFS);
-		if (!record) {
-			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
-			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
-			return -ENOMEM;
-		}
+		if (!record)
+			goto free_head_ref;
 	}
 
 	head_ref->extent_op = extent_op;
@@ -682,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
+
+free_head_ref:
+	kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+free_ref:
+	kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+	return -ENOMEM;
 }
 
 /*
-- 
cgit v1.2.3-55-g7522