From 827463c49f29111efd22148d24c9ca44d648acfa Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 14 Jan 2014 20:31:51 +0800 Subject: Btrfs: don't mix the ordered extents of all files together during logging the inodes There was a problem in the old code: If we failed to log the csum, we would free all the ordered extents in the log list including those ordered extents that were logged successfully, it would make the log committer not to wait for the completion of the ordered extents. This patch doesn't insert the ordered extents that is about to be logged into a global list, instead, we insert them into a local list. If we log the ordered extents successfully, we splice them with the global list, or we will throw them away, then do full sync. It can also reduce the lock contention and the traverse time of list. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 37 +++++++++++++++++++++++++++++-------- fs/btrfs/ordered-data.h | 6 +++++- fs/btrfs/tree-log.c | 41 +++++++++++++++-------------------------- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b16450b840e7..138a7d7e9c90 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -424,27 +424,48 @@ out: } /* Needs to either be called under a log transaction or the log_mutex */ -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; - int index = log->log_transid % 2; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); for (n = rb_first(&tree->tree); n; n = rb_next(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); - spin_lock(&log->log_extents_lock[index]); - if (list_empty(&ordered->log_list)) { - list_add_tail(&ordered->log_list, &log->logged_list[index]); - atomic_inc(&ordered->refs); - } - spin_unlock(&log->log_extents_lock[index]); + if (!list_empty(&ordered->log_list)) + continue; + list_add_tail(&ordered->log_list, logged_list); + atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } +void btrfs_put_logged_extents(struct list_head *logged_list) +{ + struct btrfs_ordered_extent *ordered; + + while (!list_empty(logged_list)) { + ordered = list_first_entry(logged_list, + struct btrfs_ordered_extent, + log_list); + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } +} + +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log) +{ + int index = log->log_transid % 2; + + spin_lock_irq(&log->log_extents_lock[index]); + list_splice_tail(logged_list, &log->logged_list[index]); + spin_unlock_irq(&log->log_extents_lock[index]); +} + void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 9b0450f7ac20..246897058efb 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -197,7 +197,11 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); -void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); +void btrfs_get_logged_extents(struct inode *inode, + struct list_head *logged_list); +void btrfs_put_logged_extents(struct 
list_head *logged_list); +void btrfs_submit_logged_extents(struct list_head *logged_list, + struct btrfs_root *log); void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39d83da03e03..7c449c699bed 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3479,7 +3479,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_one_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path) + struct extent_map *em, struct btrfs_path *path, + struct list_head *logged_list) { struct btrfs_root *log = root->log_root; struct btrfs_file_extent_item *fi; @@ -3495,7 +3496,6 @@ static int log_one_extent(struct btrfs_trans_handle *trans, u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; - int index = log->log_transid % 2; bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; int extent_inserted = 0; @@ -3579,17 +3579,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans, * First check and see if our csums are on our outstanding ordered * extents. */ -again: - spin_lock_irq(&log->log_extents_lock[index]); - list_for_each_entry(ordered, &log->logged_list[index], log_list) { + list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; if (!mod_len) break; - if (ordered->inode != inode) - continue; - if (ordered->file_offset + ordered->len <= mod_start || mod_start + mod_len <= ordered->file_offset) continue; @@ -3632,12 +3627,6 @@ again: if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) continue; - atomic_inc(&ordered->refs); - spin_unlock_irq(&log->log_extents_lock[index]); - /* - * we've dropped the lock, we must either break or - * start over after this. 
- */ if (ordered->csum_bytes_left) { btrfs_start_ordered_extent(inode, ordered, 0); @@ -3647,16 +3636,11 @@ again: list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); - if (ret) { - btrfs_put_ordered_extent(ordered); + if (ret) goto unlocked; - } } - btrfs_put_ordered_extent(ordered); - goto again; } - spin_unlock_irq(&log->log_extents_lock[index]); unlocked: if (!mod_len || ret) @@ -3694,7 +3678,8 @@ unlocked: static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct btrfs_path *path) + struct btrfs_path *path, + struct list_head *logged_list) { struct extent_map *em, *n; struct list_head extents; @@ -3752,7 +3737,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path); + ret = log_one_extent(trans, inode, root, em, path, logged_list); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3788,6 +3773,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; + LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; int ret; @@ -3836,7 +3822,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(log, inode); + btrfs_get_logged_extents(inode, &logged_list); /* * a brute force approach to making sure we get the most uptodate @@ -3962,7 +3948,8 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path); + ret = btrfs_log_changed_extents(trans, root, inode, dst_path, + &logged_list); if (ret) { err = ret; goto out_unlock; @@ -3987,8 +3974,10 @@ log_extents: BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: - if (err) - btrfs_free_logged_extents(log, log->log_transid); + if (unlikely(err)) + btrfs_put_logged_extents(&logged_list); + else + btrfs_submit_logged_extents(&logged_list, log); mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); -- cgit v1.2.3 From 23ad5b17dce0f09af82c071b26acac35a0ab892b Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Thu, 30 Jan 2014 16:32:02 +0900 Subject: btrfs: Return EXDEV for cross file system snapshot EXDEV seems an appropriate error if an operation fails bacause it crosses file system boundaries. Reviewed-by: David Sterba Signed-off-by: Kusanagi Kouichi Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9a9044585da7..a692aad8fa5a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1573,7 +1573,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (src_inode->i_sb != file_inode(file)->i_sb) { btrfs_info(BTRFS_I(src_inode)->root->fs_info, "Snapshot src from another FS"); - ret = -EINVAL; + ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { /* * Subvolume creation is not restricted, but snapshots -- cgit v1.2.3 From 391cd9df81ac07ce7e66ac8fb13e56693061a6e6 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 30 Jan 2014 16:46:54 +0800 Subject: Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace the alloc list of the filesystem is protected by ->chunk_mutex, we need get that mutex when we insert the new device into the list. 
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/dev-replace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 564c92638b20..b20d59e5e5dd 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -484,6 +484,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* keep away write_all_supers() during the finishing procedure */ + mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = @@ -503,6 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -543,6 +545,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, */ btrfs_dev_replace_unlock(dev_replace); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&root->fs_info->chunk_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); -- cgit v1.2.3 From c404e0dc2c843b154f9a36c3aec10d0a715d88eb Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 30 Jan 2014 16:46:55 +0800 Subject: Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace tests we hit a null pointer dereference (it was very easy to reproduce by running xfstests' btrfs/011 on devices using the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree, and we forgot to replace the source device in the mappings of those new chunks. - We might get mapping information that still included the source device before the mapping tree was updated, and then submit a bio based on that stale mapping after we had freed the source device. For the first bug, we can fix it by doing the mapping tree update and the source device removal in the same critical section of the chunk mutex. The chunk mutex protects the allocable device list, so holding it prevents new chunk allocation in that window, and once the source device is removed, all new chunks are allocated on the new device. That fixes the first bug. For the second bug, we need to make sure all in-flight bios are finished and no new bios are submitted while we are removing the source device. To fix this, we introduce a global @bio_counter: we not only inc/dec @bio_counter outside of map_blocks, but also inc it before submitting a bio and dec it when ending bios. Since RAID56 is a little different and device replace doesn't support raid56 yet, it is not addressed in this patch; comments are added to make sure we fix it in the future.
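To illustrate the counter scheme described above, here is a minimal userspace sketch of the same idea, assuming nothing about the kernel side: submitters bump a shared in-flight counter around each I/O, and the replace path raises a flag, drains the counter, swaps the device, then clears the flag. Pthread primitives stand in for the percpu counter and wait queue; none of the names below are btrfs APIs.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static long in_flight;   /* plays the role of fs_info->bio_counter */
static bool replacing;   /* plays the role of BTRFS_FS_STATE_DEV_REPLACING */

/* ~ btrfs_bio_counter_inc_blocked(): new I/O waits while the swap runs */
static void bio_begin(void)
{
        pthread_mutex_lock(&lock);
        while (replacing)
                pthread_cond_wait(&cond, &lock);
        in_flight++;
        pthread_mutex_unlock(&lock);
}

/* ~ btrfs_bio_counter_dec(): wake the replace path when I/O drains */
static void bio_end(void)
{
        pthread_mutex_lock(&lock);
        if (--in_flight == 0)
                pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

/* ~ btrfs_rm_dev_replace_blocked(): block new I/O, wait out the old I/O */
static void replace_blocked(void)
{
        pthread_mutex_lock(&lock);
        replacing = true;
        while (in_flight)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        /* the source device can now be removed safely */
}

/* ~ btrfs_rm_dev_replace_unblocked(): let new I/O through again */
static void replace_unblocked(void)
{
        pthread_mutex_lock(&lock);
        replacing = false;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        bio_begin();
        bio_end();
        replace_blocked();
        replace_unblocked();
        puts("in-flight I/O drained before the device swap");
        return 0;
}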
Reported-by: Qu Wenruo Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 9 ++++++ fs/btrfs/dev-replace.c | 74 +++++++++++++++++++++++++++++++++++++++++++++----- fs/btrfs/disk-io.c | 12 +++++++- fs/btrfs/volumes.c | 30 +++++++++++++++----- fs/btrfs/volumes.h | 1 + 5 files changed, 111 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fceddbdfdd3d..dac6653d4cce 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -351,6 +351,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_ERROR 0 #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 +#define BTRFS_FS_STATE_DEV_REPLACING 3 /* Super block flags */ /* Errors detected */ @@ -1674,6 +1675,9 @@ struct btrfs_fs_info { atomic_t mutually_exclusive_operation_running; + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; + struct semaphore uuid_tree_rescan_sem; unsigned int update_uuid_tree_gen:1; }; @@ -4008,6 +4012,11 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, struct btrfs_scrub_progress *progress); +/* dev-replace.c */ +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); + /* reada.c */ struct reada_control { struct btrfs_root *root; /* tree to prefetch */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index b20d59e5e5dd..ec1c3f3a775d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -431,6 +431,35 @@ leave_no_lock: return ret; } +/* + * blocked until all flighting bios are finished. + */ +static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) +{ + s64 writers; + DEFINE_WAIT(wait); + + set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + do { + prepare_to_wait(&fs_info->replace_wait, &wait, + TASK_UNINTERRUPTIBLE); + writers = percpu_counter_sum(&fs_info->bio_counter); + if (writers) + schedule(); + finish_wait(&fs_info->replace_wait, &wait); + } while (writers); +} + +/* + * we have removed target device, it is safe to allow new bios request. + */ +static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -458,12 +487,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device = dev_replace->srcdev; btrfs_dev_replace_unlock(dev_replace); - /* replace old device with new one in mapping tree */ - if (!scrub_ret) - btrfs_dev_replace_update_device_in_mapping_tree(fs_info, - src_device, - tgt_device); - /* * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished @@ -495,7 +518,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, dev_replace->time_stopped = get_seconds(); dev_replace->item_needs_writeback = 1; - if (scrub_ret) { + /* replace old device with new one in mapping tree */ + if (!scrub_ret) { + btrfs_dev_replace_update_device_in_mapping_tree(fs_info, + src_device, + tgt_device); + } else { printk_in_rcu(KERN_ERR "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", src_device->missing ? 
"" : @@ -534,8 +562,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + btrfs_rm_dev_replace_blocked(fs_info); + btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_unblocked(fs_info); + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the @@ -865,3 +897,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace) mutex_unlock(&dev_replace->lock_management_lock); } } + +void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) +{ + percpu_counter_inc(&fs_info->bio_counter); +} + +void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + percpu_counter_dec(&fs_info->bio_counter); + + if (waitqueue_active(&fs_info->replace_wait)) + wake_up(&fs_info->replace_wait); +} + +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) +{ + DEFINE_WAIT(wait); +again: + percpu_counter_inc(&fs_info->bio_counter); + if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + btrfs_bio_counter_dec(fs_info); + wait_event(fs_info->replace_wait, + !test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state)); + goto again; + } + +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fcf367581073..0cafacb07b43 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2136,10 +2136,16 @@ int open_ctree(struct super_block *sb, goto fail_dirty_metadata_bytes; } + ret = percpu_counter_init(&fs_info->bio_counter, 0); + if (ret) { + err = ret; + goto fail_delalloc_bytes; + } + fs_info->btree_inode = new_inode(sb); if (!fs_info->btree_inode) { err = -ENOMEM; - goto fail_delalloc_bytes; + goto fail_bio_counter; } mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -2214,6 +2220,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->scrub_pause_req, 0); atomic_set(&fs_info->scrubs_paused, 0); atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->replace_wait); init_waitqueue_head(&fs_info->scrub_pause_wait); fs_info->scrub_workers_refcnt = 0; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY @@ -2966,6 +2973,8 @@ fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); iput(fs_info->btree_inode); +fail_bio_counter: + percpu_counter_destroy(&fs_info->bio_counter); fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: @@ -3613,6 +3622,7 @@ int close_ctree(struct btrfs_root *root) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); + percpu_counter_destroy(&fs_info->bio_counter); bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b68afe32419f..07629e99809a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5263,6 +5263,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5270,7 +5271,6 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; - struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5292,6 +5292,8 @@ static void btrfs_end_bio(struct bio 
*bio, int err) if (bio == bbio->orig_bio) is_orig_bio = 1; + btrfs_bio_counter_dec(bbio->fs_info); + if (atomic_dec_and_test(&bbio->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); @@ -5440,6 +5442,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, } #endif bio->bi_bdev = dev->bdev; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + if (async) btrfs_schedule_bio(root, dev, rw, bio); else @@ -5508,28 +5513,38 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, length = bio->bi_size; map_length = length; + btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, mirror_num, &raid_map); - if (ret) /* -ENOMEM */ + if (ret) { + btrfs_bio_counter_dec(root->fs_info); return ret; + } total_devs = bbio->num_stripes; bbio->orig_bio = first_bio; bbio->private = first_bio->bi_private; bbio->end_io = first_bio->bi_end_io; + bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); if (raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - return raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, + raid_map, map_length); } else { - return raid56_parity_recover(root, bio, bbio, - raid_map, map_length, - mirror_num); + ret = raid56_parity_recover(root, bio, bbio, + raid_map, map_length, + mirror_num); } + /* + * FIXME, replace dosen't support raid56 yet, please fix + * it in the future. + */ + btrfs_bio_counter_dec(root->fs_info); + return ret; } if (map_length < length) { @@ -5571,6 +5586,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, async_submit); dev_nr++; } + btrfs_bio_counter_dec(root->fs_info); return 0; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8b3cd142b373..80754f9dd3df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -192,6 +192,7 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { atomic_t stripes_pending; + struct btrfs_fs_info *fs_info; bio_end_io_t *end_io; struct bio *orig_bio; void *private; -- cgit v1.2.3 From d86477b303da51832002eec1cdec2938c42fccc3 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Thu, 30 Jan 2014 13:27:12 +0000 Subject: Btrfs: add missing error check in incremental send Function wait_for_parent_move() returns negative value if an error happened, 0 if we don't need to wait for the parent's move, and 1 if the wait is needed. Before this change an error return value was being treated like the return value 1, which was not correct. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9dde9717c1b9..70272e1d2b1e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3227,7 +3227,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. 
*/ - if (wait_for_parent_move(sctx, cur)) { + ret = wait_for_parent_move(sctx, cur); + if (ret < 0) + goto out; + if (ret) { ret = add_pending_dir_move(sctx, cur->dir); *pending_move = 1; -- cgit v1.2.3 From abccd00f8af27c585be48904515bad5658130e48 Mon Sep 17 00:00:00 2001 From: Hugo Mills Date: Thu, 30 Jan 2014 20:17:00 +0000 Subject: btrfs: Fix 32/64-bit problem with BTRFS_SET_RECEIVED_SUBVOL ioctl The structure for BTRFS_SET_RECEIVED_IOCTL packs differently on 32-bit and 64-bit systems. This means that it is impossible to use btrfs receive on a system with a 64-bit kernel and 32-bit userspace, because the structure size (and hence the ioctl number) is different. This patch adds a compatibility structure and ioctl to deal with the above case. Signed-off-by: Hugo Mills Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a692aad8fa5a..d4c179502775 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -59,6 +59,32 @@ #include "props.h" #include "sysfs.h" +#ifdef CONFIG_64BIT +/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI + * structures are incorrect, as the timespec structure from userspace + * is 4 bytes too small. We define these alternatives here to teach + * the kernel about the 32-bit struct packing. + */ +struct btrfs_ioctl_timespec_32 { + __u64 sec; + __u32 nsec; +} __attribute__ ((__packed__)); + +struct btrfs_ioctl_received_subvol_args_32 { + char uuid[BTRFS_UUID_SIZE]; /* in */ + __u64 stransid; /* in */ + __u64 rtransid; /* out */ + struct btrfs_ioctl_timespec_32 stime; /* in */ + struct btrfs_ioctl_timespec_32 rtime; /* out */ + __u64 flags; /* in */ + __u64 reserved[16]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \ + struct btrfs_ioctl_received_subvol_args_32) +#endif + + static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff); @@ -4375,10 +4401,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) return btrfs_qgroup_wait_for_completion(root->fs_info); } -static long btrfs_ioctl_set_received_subvol(struct file *file, - void __user *arg) +static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct btrfs_ioctl_received_subvol_args *sa) { - struct btrfs_ioctl_received_subvol_args *sa = NULL; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; @@ -4406,13 +4431,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } - sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) { - ret = PTR_ERR(sa); - sa = NULL; - goto out; - } - /* * 1 - root item * 2 - uuid items (received uuid + subvol uuid) @@ -4466,14 +4484,91 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, goto out; } +out: + up_write(&root->fs_info->subvol_sem); + mnt_drop_write_file(file); + return ret; +} + +#ifdef CONFIG_64BIT +static long btrfs_ioctl_set_received_subvol_32(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL; + struct btrfs_ioctl_received_subvol_args *args64 = NULL; + int ret = 0; + + args32 = memdup_user(arg, sizeof(*args32)); + if (IS_ERR(args32)) { + ret = PTR_ERR(args32); + args32 = NULL; + goto out; + } + + args64 = kmalloc(sizeof(*args64), GFP_NOFS); + if (IS_ERR(args64)) { 
+ ret = PTR_ERR(args64); + args64 = NULL; + goto out; + } + + memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE); + args64->stransid = args32->stransid; + args64->rtransid = args32->rtransid; + args64->stime.sec = args32->stime.sec; + args64->stime.nsec = args32->stime.nsec; + args64->rtime.sec = args32->rtime.sec; + args64->rtime.nsec = args32->rtime.nsec; + args64->flags = args32->flags; + + ret = _btrfs_ioctl_set_received_subvol(file, args64); + if (ret) + goto out; + + memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE); + args32->stransid = args64->stransid; + args32->rtransid = args64->rtransid; + args32->stime.sec = args64->stime.sec; + args32->stime.nsec = args64->stime.nsec; + args32->rtime.sec = args64->rtime.sec; + args32->rtime.nsec = args64->rtime.nsec; + args32->flags = args64->flags; + + ret = copy_to_user(arg, args32, sizeof(*args32)); + if (ret) + ret = -EFAULT; + +out: + kfree(args32); + kfree(args64); + return ret; +} +#endif + +static long btrfs_ioctl_set_received_subvol(struct file *file, + void __user *arg) +{ + struct btrfs_ioctl_received_subvol_args *sa = NULL; + int ret = 0; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) { + ret = PTR_ERR(sa); + sa = NULL; + goto out; + } + + ret = _btrfs_ioctl_set_received_subvol(file, sa); + + if (ret) + goto out; + ret = copy_to_user(arg, sa, sizeof(*sa)); if (ret) ret = -EFAULT; out: kfree(sa); - up_write(&root->fs_info->subvol_sem); - mnt_drop_write_file(file); return ret; } @@ -4792,6 +4887,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_balance_progress(root, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); +#ifdef CONFIG_64BIT + case BTRFS_IOC_SET_RECEIVED_SUBVOL_32: + return btrfs_ioctl_set_received_subvol_32(file, argp); +#endif case BTRFS_IOC_SEND: return btrfs_ioctl_send(file, argp); case BTRFS_IOC_GET_DEV_STATS: -- cgit v1.2.3 From 98cfee214394a3560bd4ce3209b55a71c4267783 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 1 Feb 2014 00:42:05 +0800 Subject: Btrfs: only add roots if necessary in find_parent_nodes() find_all_leafs() dosen't need add all roots actually, add roots only if we need, this can avoid unnecessary ulist dance. 
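The shape of that optimization, shown as a small standalone C sketch with invented names (not the btrfs API): the output collection is optional, and the helper only does the extra collection work when the caller passes somewhere to put it, which is what handing a NULL roots list to the lookup achieves.

#include <stddef.h>
#include <stdio.h>

/* Walk some items; collect "roots" only if the caller asked for them. */
static void find_nodes(const int *items, size_t n, int *roots_out, size_t *nroots)
{
        size_t found = 0;

        for (size_t i = 0; i < n; i++) {
                /* pretend the even items are roots of the tree */
                if (roots_out && items[i] % 2 == 0)
                        roots_out[found++] = items[i];
        }
        if (nroots)
                *nroots = found;
}

int main(void)
{
        int items[] = { 1, 2, 3, 4, 5, 6 };
        int roots[6];
        size_t nroots = 0;

        find_nodes(items, 6, NULL, NULL);      /* leafs-only caller: skip roots work */
        find_nodes(items, 6, roots, &nroots);  /* caller that does want the roots */
        printf("collected %zu roots\n", nroots);
        return 0;
}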
Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/backref.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index aded3ef3d3d4..903fe68e017b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -965,7 +965,7 @@ again: while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); WARN_ON(ref->count < 0); - if (ref->count && ref->root_id && ref->parent == 0) { + if (roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) @@ -1061,22 +1061,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, u64 time_seq, struct ulist **leafs, const u64 *extent_item_pos) { - struct ulist *tmp; int ret; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) { - ulist_free(tmp); + if (!*leafs) return -ENOMEM; - } ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, *leafs, tmp, extent_item_pos); - ulist_free(tmp); - + time_seq, *leafs, NULL, extent_item_pos); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; -- cgit v1.2.3 From 03cb4fb9d86d591bc8a3f66eac6fb874b50b1b4d Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 1 Feb 2014 02:00:15 +0000 Subject: Btrfs: fix send dealing with file renames and directory moves This fixes a case that the commit titled: Btrfs: fix infinite path build loops in incremental send didn't cover. If the parent-child relationship between 2 directories is inverted, both get renamed, and the former parent has a file that got renamed too (but remains a child of that directory), the incremental send operation would use the file's old path after sending an unlink operation for that old path, causing receive to fail on future operations like changing owner, permissions or utimes of the corresponding inode. This is not a regression from the commit mentioned before, as without that commit we would fall into the issues that commit fixed, so it's just one case that wasn't covered before. Simple steps to reproduce this issue are: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/c/d $ touch /mnt/btrfs/a/b/c/d/file $ mkdir -p /mnt/btrfs/a/b/x $ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ mv /mnt/btrfs/a/b/x /mnt/btrfs/a/b/c/x2 $ mv /mnt/btrfs/a/b/c/d /mnt/btrfs/a/b/c/x2/d2 $ mv /mnt/btrfs/a/b/c/x2/d2/file /mnt/btrfs/a/b/c/x2/d2/file2 $ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send A patch to update the test btrfs/030 from xfstests, so that it covers this case, will be submitted soon. 
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 70272e1d2b1e..8bd0505ee2f9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2131,8 +2131,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; - u64 start_ino = ino; - u64 start_gen = gen; int skip_name_cache = 0; name = fs_path_alloc(); @@ -2144,7 +2142,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, if (is_waiting_for_move(sctx, ino)) skip_name_cache = 1; -again: dest->reversed = 1; fs_path_reset(dest); @@ -2159,13 +2156,8 @@ again: stop = 1; if (!skip_name_cache && - is_waiting_for_move(sctx, parent_inode)) { - ino = start_ino; - gen = start_gen; - stop = 0; + is_waiting_for_move(sctx, parent_inode)) skip_name_cache = 1; - goto again; - } ret = fs_path_add_path(dest, name); if (ret < 0) -- cgit v1.2.3 From 5ed7f9ff15e6ea56bcb78f69e9503dc1a587caf0 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sat, 1 Feb 2014 02:02:16 +0000 Subject: Btrfs: more send support for parent/child dir relationship inversion The commit titled "Btrfs: fix infinite path build loops in incremental send" didn't cover a particular case where the parent-child relationship inversion of directories doesn't imply a rename of the new parent directory. This was due to a simple logic mistake, a logical and instead of a logical or. Steps to reproduce: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/bar1/bar2/bar3/bar4 $ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ mv /mnt/btrfs/a/b/bar1/bar2/bar3/bar4 /mnt/btrfs/a/b/k44 $ mv /mnt/btrfs/a/b/bar1/bar2/bar3 /mnt/btrfs/a/b/k44 $ mv /mnt/btrfs/a/b/bar1/bar2 /mnt/btrfs/a/b/k44/bar3 $ mv /mnt/btrfs/a/b/bar1 /mnt/btrfs/a/b/k44/bar3/bar2/k11 $ btrfs subvol snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 > /tmp/incremental.send A patch to update the test btrfs/030 from xfstests, so that it covers this case, will be submitted soon. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8bd0505ee2f9..154a717d2a3c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3053,8 +3053,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, len1 = fs_path_len(path_before); len2 = fs_path_len(path_after); - if ((parent_ino_before != parent_ino_after) && (len1 != len2 || - memcmp(path_before->start, path_after->start, len1))) { + if (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1)) { ret = 1; goto out; } -- cgit v1.2.3 From 64792f253508268eb390a86f42f128d877b40776 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 3 Feb 2014 18:24:09 +0100 Subject: btrfs: send: replace check with an assert in gen_unique_name The buffer passed to snprintf can hold the fully expanded format string, 64 = 3x largest ULL + 3x char + trailing null. I don't think that removing the check entirely is a good idea, hence the ASSERT. 
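A quick standalone check of that arithmetic, as a userspace sketch rather than the kernel code: the worst case for "o%llu-%llu-%llu" is one 'o', three 20-digit u64 values, two '-' separators and the trailing NUL, i.e. 1 + 60 + 2 + 1 = 64 bytes, so a 64-byte buffer always suffices and the ASSERT cannot fire for valid inputs.

#include <limits.h>
#include <stdio.h>

int main(void)
{
        char tmp[64];
        /* ULLONG_MAX is 18446744073709551615: 20 digits, the widest %llu */
        unsigned long long v = ULLONG_MAX;

        /* worst case: 'o' + 20 + '-' + 20 + '-' + 20 = 63 chars + NUL = 64 */
        int len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", v, v, v);

        printf("worst-case length %d fits in %zu-byte buffer: %s\n",
               len, sizeof(tmp), len < (int)sizeof(tmp) ? "yes" : "no");
        return 0;
}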
Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 154a717d2a3c..08edd0a7fff1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1418,11 +1418,7 @@ static int gen_unique_name(struct send_ctx *sctx, while (1) { len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); - if (len >= sizeof(tmp)) { - /* should really not happen */ - ret = -EOVERFLOW; - goto out; - } + ASSERT(len < sizeof(tmp)); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, -- cgit v1.2.3 From b23ab57d485c985c10ee7c03627359bfbba590d8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 3 Feb 2014 19:23:19 +0100 Subject: btrfs: send: remove prepared member from fs_path The member is used only to return value back from fs_path_prepare_for_add, we can do it locally and save 8 bytes for the inline_buf path. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 08edd0a7fff1..851ebfd43aa7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -51,7 +51,6 @@ struct fs_path { struct { char *start; char *end; - char *prepared; char *buf; int buf_len; @@ -338,7 +337,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) return 0; } -static int fs_path_prepare_for_add(struct fs_path *p, int name_len) +static int fs_path_prepare_for_add(struct fs_path *p, int name_len, + char **prepared) { int ret; int new_len; @@ -354,11 +354,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len) if (p->start != p->end) *--p->start = '/'; p->start -= name_len; - p->prepared = p->start; + *prepared = p->start; } else { if (p->start != p->end) *p->end++ = '/'; - p->prepared = p->end; + *prepared = p->end; p->end += name_len; *p->end = 0; } @@ -370,12 +370,12 @@ out: static int fs_path_add(struct fs_path *p, const char *name, int name_len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, name_len); + ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, name, name_len); - p->prepared = NULL; + memcpy(prepared, name, name_len); out: return ret; @@ -384,12 +384,12 @@ out: static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, p2->end - p2->start); + ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); if (ret < 0) goto out; - memcpy(p->prepared, p2->start, p2->end - p2->start); - p->prepared = NULL; + memcpy(prepared, p2->start, p2->end - p2->start); out: return ret; @@ -400,13 +400,13 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, unsigned long off, int len) { int ret; + char *prepared; - ret = fs_path_prepare_for_add(p, len); + ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) goto out; - read_extent_buffer(eb, p->prepared, off, len); - p->prepared = NULL; + read_extent_buffer(eb, prepared, off, len); out: return ret; -- cgit v1.2.3 From e25a8122061edcde6175cbcfd2e21367ad017212 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 3 Feb 2014 19:23:33 +0100 Subject: btrfs: send: remove virtual_mem member from fs_path We don't need to keep track of that, it's available via is_vmalloc_addr. 
Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 851ebfd43aa7..5b9b82b32cde 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -55,7 +55,6 @@ struct fs_path { char *buf; int buf_len; unsigned int reversed:1; - unsigned int virtual_mem:1; char inline_buf[]; }; char pad[PAGE_SIZE]; @@ -241,7 +240,6 @@ static struct fs_path *fs_path_alloc(void) if (!p) return NULL; p->reversed = 0; - p->virtual_mem = 0; p->buf = p->inline_buf; p->buf_len = FS_PATH_INLINE_SIZE; fs_path_reset(p); @@ -265,7 +263,7 @@ static void fs_path_free(struct fs_path *p) if (!p) return; if (p->buf != p->inline_buf) { - if (p->virtual_mem) + if (is_vmalloc_addr(p->buf)) vfree(p->buf); else kfree(p->buf); @@ -299,13 +297,12 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) tmp_buf = vmalloc(len); if (!tmp_buf) return -ENOMEM; - p->virtual_mem = 1; } memcpy(tmp_buf, p->buf, p->buf_len); p->buf = tmp_buf; p->buf_len = len; } else { - if (p->virtual_mem) { + if (is_vmalloc_addr(p->buf)) { tmp_buf = vmalloc(len); if (!tmp_buf) return -ENOMEM; @@ -319,7 +316,6 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) return -ENOMEM; memcpy(tmp_buf, p->buf, p->buf_len); kfree(p->buf); - p->virtual_mem = 1; } } p->buf = tmp_buf; -- cgit v1.2.3 From 1f5a7ff999523e9996befbe03e196eb73370fe36 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 3 Feb 2014 19:23:47 +0100 Subject: btrfs: send: squeeze bitfilelds in fs_path We know that buf_len is at most PATH_MAX, 4k, and can merge it with the reversed member. This saves 3 bytes in favor of inline_buf. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5b9b82b32cde..4405aae05281 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -53,8 +53,8 @@ struct fs_path { char *end; char *buf; - int buf_len; - unsigned int reversed:1; + unsigned short buf_len:15; + unsigned short reversed:1; char inline_buf[]; }; char pad[PAGE_SIZE]; -- cgit v1.2.3 From 4d1a63b21b4f77a82efe7d78fc1ae1cc7532691c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 3 Feb 2014 19:24:19 +0100 Subject: btrfs: send: remove BUG from process_all_refs There are only 2 static callers, the BUG would normally be never reached, but let's be nice. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4405aae05281..d3ed9df77422 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3606,7 +3606,10 @@ static int process_all_refs(struct send_ctx *sctx, root = sctx->parent_root; cb = __record_deleted_ref; } else { - BUG(); + btrfs_err(sctx->send_root->fs_info, + "Wrong command %d in process_all_refs", cmd); + ret = -EINVAL; + goto out; } key.objectid = sctx->cmp_key->objectid; -- cgit v1.2.3 From 57fb8910c24004ec924103c9a8c8542119f7629a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 3 Feb 2014 19:24:40 +0100 Subject: btrfs: send: remove BUG_ON from name_cache_delete If cleaning the name cache fails, we could try to proceed at the cost of some memory leak. This is not expected to happen often. 
Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d3ed9df77422..bef7ba638dee 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1890,13 +1890,20 @@ static void name_cache_delete(struct send_ctx *sctx, nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)nce->ino); - BUG_ON(!nce_head); + if (!nce_head) { + btrfs_err(sctx->send_root->fs_info, + "name_cache_delete lookup failed ino %llu cache size %d, leaking memory", + nce->ino, sctx->name_cache_size); + } list_del(&nce->radix_list); list_del(&nce->list); sctx->name_cache_size--; - if (list_empty(nce_head)) { + /* + * We may not get to the final release of nce_head if the lookup fails + */ + if (nce_head && list_empty(nce_head)) { radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino); kfree(nce_head); } -- cgit v1.2.3 From a0859c0998605d2dc1b021543398cd84a40589db Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 5 Feb 2014 16:48:55 +0000 Subject: Btrfs: use right extent item position in send when finding extent clones This was a leftover from the commit: 74dd17fbe3d65829e75d84f00a9525b2ace93998 (Btrfs: fix btrfs send for inline items and compression) Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bef7ba638dee..89fefbd955f3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1288,8 +1288,6 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = logical - found_key.objectid; else extent_item_pos = 0; - - extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(sctx->send_root->fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, backref_ctx); -- cgit v1.2.3 From dff6d0adbe998927f72fc8d9ceee8ff72b124328 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Wed, 5 Feb 2014 16:48:56 +0000 Subject: Btrfs: make some tree searches in send.c more efficient We have this pattern where we do search for a contiguous group of items in a tree and everytime we find an item, we process it, then we release our path, increment the offset of the search key, do another full tree search and repeat these steps until a tree search can't find more items we're interested in. Instead of doing these full tree searches after processing each item, just process the next item/slot in our leaf and don't release the path. Since all these trees are read only and we always use the commit root for a search and skip node/leaf locks, we're not affecting concurrency on the trees. 
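Reduced to its shape, the new loop looks like the standalone sketch below, using a toy leaf/slot model invented for illustration rather than btrfs structures: keep one position and advance slot by slot, only stepping to the next leaf when the current one runs out, instead of redoing a full search after every item.

#include <stdio.h>

#define LEAF_SLOTS 4
#define NR_LEAVES  3

static int leaves[NR_LEAVES][LEAF_SLOTS] = {
        { 1, 2, 3, 4 }, { 5, 6, 7, 8 }, { 9, 10, 11, 12 },
};

int main(void)
{
        int leaf = 0, slot = 0;   /* one positioned "path", kept across items */

        while (leaf < NR_LEAVES) {
                if (slot >= LEAF_SLOTS) {   /* leaf exhausted: fetch the next one */
                        leaf++;
                        slot = 0;
                        continue;
                }
                printf("processing item %d\n", leaves[leaf][slot]);
                slot++;                     /* cheap: next slot, no re-search */
        }
        return 0;
}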
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 105 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 89fefbd955f3..a2621e7aaa5c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2501,17 +2501,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { - ret = btrfs_search_slot_for_read(sctx->send_root, &key, path, - 1, 0); - if (ret < 0) - goto out; - if (!ret) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(sctx->send_root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; } - if (ret || found_key.objectid != key.objectid || + + btrfs_item_key_to_cpu(eb, &found_key, slot); + if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; goto out; @@ -2526,8 +2535,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir) goto out; } - key.offset = found_key.offset + 1; - btrfs_release_path(path); + path->slots[0]++; } out: @@ -2693,19 +2701,24 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (!ret) { - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; } - if (ret || found_key.objectid != key.objectid || - found_key.type != key.type) { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid || + found_key.type != key.type) break; - } di = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_item); @@ -2716,8 +2729,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) goto out; } - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } ret = 1; @@ -3620,15 +3632,22 @@ static int process_all_refs(struct send_ctx *sctx, key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) - break; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || @@ -3637,11 +3656,10 @@ static int process_all_refs(struct send_ctx *sctx, break; ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); - btrfs_release_path(path); if (ret < 0) goto out; - key.offset = found_key.offset + 1; + path->slots[0]++; } btrfs_release_path(path); @@ -3922,19 +3940,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx) 
key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - while (1) { - ret = btrfs_search_slot_for_read(root, &key, path, 1, 0); - if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + while (1) { eb = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); + if (slot >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + break; + } + continue; + } + btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; @@ -3946,8 +3970,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) if (ret < 0) goto out; - btrfs_release_path(path); - key.offset = found_key.offset + 1; + path->slots[0]++; } out: -- cgit v1.2.3 From ace0105076a493c04e6d5e91e6a19f222d6b3875 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2014 16:17:34 +0100 Subject: btrfs: send: lower memory requirements in common case The fs_path structure uses an inline buffer and falls back to a chain of allocations, but vmalloc is not necessary because PATH_MAX fits into PAGE_SIZE. The size of fs_path has been reduced to 256 bytes from PAGE_SIZE, usually 4k. Experimental measurements show that most paths on a single filesystem do not exceed 200 bytes, and these get stored into the inline buffer directly, which is now 230 bytes. Longer paths are kmalloced when needed. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 106 ++++++++++++++++++++------------------------------------ 1 file changed, 37 insertions(+), 69 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a2621e7aaa5c..a5da82f49120 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -57,7 +57,12 @@ struct fs_path { unsigned short reversed:1; char inline_buf[]; }; - char pad[PAGE_SIZE]; + /* + * Average path length does not exceed 200 bytes, we'll have + * better packing in the slab and higher chance to satisfy + * a allocation later during send. 
+ */ + char pad[256]; }; }; #define FS_PATH_INLINE_SIZE \ @@ -262,12 +267,8 @@ static void fs_path_free(struct fs_path *p) { if (!p) return; - if (p->buf != p->inline_buf) { - if (is_vmalloc_addr(p->buf)) - vfree(p->buf); - else - kfree(p->buf); - } + if (p->buf != p->inline_buf) + kfree(p->buf); kfree(p); } @@ -287,40 +288,31 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - path_len = p->end - p->start; - old_buf_len = p->buf_len; - len = PAGE_ALIGN(len); - + /* + * First time the inline_buf does not suffice + */ if (p->buf == p->inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - } - memcpy(tmp_buf, p->buf, p->buf_len); - p->buf = tmp_buf; - p->buf_len = len; + p->buf = kmalloc(len, GFP_NOFS); + if (!p->buf) + return -ENOMEM; + /* + * The real size of the buffer is bigger, this will let the + * fast path happen most of the time + */ + p->buf_len = ksize(p->buf); } else { - if (is_vmalloc_addr(p->buf)) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - vfree(p->buf); - } else { - tmp_buf = krealloc(p->buf, len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p->buf, p->buf_len); - kfree(p->buf); - } - } - p->buf = tmp_buf; - p->buf_len = len; + char *tmp; + + tmp = krealloc(p->buf, len, GFP_NOFS); + if (!tmp) + return -ENOMEM; + p->buf = tmp; + p->buf_len = ksize(p->buf); } + + path_len = p->end - p->start; + old_buf_len = p->buf_len; + if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; @@ -911,9 +903,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - char *buf2 = NULL; - int buf_len; - int buf_virtual = 0; + const int buf_len = PATH_MAX; u32 name_len; u32 data_len; u32 cur; @@ -923,7 +913,6 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - buf_len = PAGE_SIZE; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -945,30 +934,12 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, type = btrfs_dir_type(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); + /* + * Path too long + */ if (name_len + data_len > buf_len) { - buf_len = PAGE_ALIGN(name_len + data_len); - if (buf_virtual) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - vfree(buf); - } else { - buf2 = krealloc(buf, buf_len, GFP_NOFS); - if (!buf2) { - buf2 = vmalloc(buf_len); - if (!buf2) { - ret = -ENOMEM; - goto out; - } - kfree(buf); - buf_virtual = 1; - } - } - - buf = buf2; - buf2 = NULL; + ret = -ENAMETOOLONG; + goto out; } read_extent_buffer(eb, buf, (unsigned long)(di + 1), @@ -991,10 +962,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - if (buf_virtual) - vfree(buf); - else - kfree(buf); + kfree(buf); return ret; } -- cgit v1.2.3 From 1bae30982bc86ab66d61ccb6e22792593b45d44d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 5 Feb 2014 15:36:18 +0100 Subject: btrfs: add simple debugfs interface Help during debugging to export various interesting infromation and tunables without the need of extra mount options or ioctls. 
Usage: * declare your variable in sysfs.h, and include where you need it * define the variable in sysfs.c and make it visible via debugfs_create_TYPE Depends on CONFIG_DEBUG_FS. Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/sysfs.c | 33 +++++++++++++++++++++++++++------ fs/btrfs/sysfs.h | 5 +++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 865f4cf9a769..c5eb2143dc66 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" @@ -599,6 +600,12 @@ static int add_device_membership(struct btrfs_fs_info *fs_info) /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; +/* /sys/kernel/debug/btrfs */ +static struct dentry *btrfs_debugfs_root_dentry; + +/* Debugging tunables and exported data */ +u64 btrfs_debugfs_test; + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; @@ -642,27 +649,41 @@ failure: return error; } +static int btrfs_init_debugfs(void) +{ +#ifdef CONFIG_DEBUG_FS + btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL); + if (!btrfs_debugfs_root_dentry) + return -ENOMEM; + + debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry, + &btrfs_debugfs_test); +#endif + return 0; +} + int btrfs_init_sysfs(void) { int ret; + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); if (!btrfs_kset) return -ENOMEM; - init_feature_attrs(); + ret = btrfs_init_debugfs(); + if (ret) + return ret; + init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); - if (ret) { - kset_unregister(btrfs_kset); - return ret; - } - return 0; + return ret; } void btrfs_exit_sysfs(void) { sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); kset_unregister(btrfs_kset); + debugfs_remove_recursive(btrfs_debugfs_root_dentry); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f3cea3710d44..9ab576318a84 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,6 +1,11 @@ #ifndef _BTRFS_SYSFS_H_ #define _BTRFS_SYSFS_H_ +/* + * Data exported through sysfs + */ +extern u64 btrfs_debugfs_test; + enum btrfs_feature_set { FEAT_COMPAT, FEAT_COMPAT_RO, -- cgit v1.2.3 From c581afc8db4e9aaa8af2246bb72c1bf72825014d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Feb 2014 16:06:06 -0500 Subject: Btrfs: balance delayed inode updates While trying to reproduce a delayed ref problem I noticed the box kept falling over using all 80gb of my ram with btrfs_inode's and btrfs_delayed_node's. Turns out this is because we only throttle delayed inode updates in btrfs_dirty_inode, which doesn't actually get called that often, especially when all you are doing is creating a bunch of files. So balance delayed inode updates everytime we create a new inode. With this patch we no longer use up all of our ram with delayed inode updates. 
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ffb6d79f9f0..a7e6690e0946 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5795,6 +5795,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } out_unlock: btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); if (drop_inode) { inode_dec_link_count(inode); @@ -5868,6 +5869,7 @@ out_unlock: inode_dec_link_count(inode); iput(inode); } + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } @@ -5926,6 +5928,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } btrfs_end_transaction(trans, root); + btrfs_balance_delayed_items(root); fail: if (drop_inode) { inode_dec_link_count(inode); @@ -5992,6 +5995,7 @@ out_fail: btrfs_end_transaction(trans, root); if (drop_on_err) iput(inode); + btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; } -- cgit v1.2.3 From 29bce2f3997a8dc5195b7a7724362d1e55df7bb2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 7 Feb 2014 12:21:23 -0500 Subject: Btrfs: unlock extent and pages on error in cow_file_range When I converted the BUG_ON() for the free_space_cache_inode in cow_file_range I made it so we just return an error instead of unlocking all of our various stuff. This is a mistake and causes us to hang when we run into this. This patch fixes this problem. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7e6690e0946..5b8925003090 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -864,7 +864,8 @@ static noinline int cow_file_range(struct inode *inode, if (btrfs_is_free_space_inode(inode)) { WARN_ON_ONCE(1); - return -EINVAL; + ret = -EINVAL; + goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); -- cgit v1.2.3 From f88ba6a2a44ee98e8d59654463dc157bb6d13c43 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 5 Feb 2014 16:34:38 +0900 Subject: Btrfs: skip submitting barrier for missing device I got an error on v3.13: BTRFS error (device sdf1) in write_all_supers:3378: errno=-5 IO failure (errors while submitting device barriers.) how to reproduce: > mkfs.btrfs -f -d raid1 /dev/sdf1 /dev/sdf2 > wipefs -a /dev/sdf2 > mount -o degraded /dev/sdf1 /mnt > btrfs balance start -f -sconvert=single -mconvert=single -dconvert=single /mnt The reason of the error is that barrier_all_devices() failed to submit barrier to the missing device. However it is clear that we cannot do anything on missing device, and also it is not necessary to care chunks on the missing device. This patch stops sending/waiting barrier if device is missing. 
Signed-off-by: Hidetoshi Seto Cc: Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0cafacb07b43..74c9be89fc0c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3256,6 +3256,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* send down all the barriers */ head = &info->fs_devices->devices; list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_send++; continue; @@ -3270,6 +3272,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info) /* wait for all the barriers */ list_for_each_entry_rcu(dev, head, dev_list) { + if (dev->missing) + continue; if (!dev->bdev) { errors_wait++; continue; -- cgit v1.2.3 From 850a8cdffe41abec9e3319d7801c49eced0778a1 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 6 Feb 2014 20:02:29 +0800 Subject: Btrfs: switch to btrfs_previous_extent_item() Since we have introduced btrfs_previous_extent_item() to search previous extent item, just switch into it. Signed-off-by: Wang Shilong Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik --- fs/btrfs/backref.c | 37 ++++++------------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 903fe68e017b..a88da721dfc5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1325,38 +1325,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, if (ret < 0) return ret; - while (1) { - u32 nritems; - if (path->slots[0] == 0) { - btrfs_set_path_blocking(path); - ret = btrfs_prev_leaf(fs_info->extent_root, path); - if (ret != 0) { - if (ret > 0) { - pr_debug("logical %llu is not within " - "any extent\n", logical); - ret = -ENOENT; - } - return ret; - } - } else { - path->slots[0]--; - } - nritems = btrfs_header_nritems(path->nodes[0]); - if (nritems == 0) { - pr_debug("logical %llu is not within any extent\n", - logical); - return -ENOENT; - } - if (path->slots[0] == nritems) - path->slots[0]--; - - btrfs_item_key_to_cpu(path->nodes[0], found_key, - path->slots[0]); - if (found_key->type == BTRFS_EXTENT_ITEM_KEY || - found_key->type == BTRFS_METADATA_ITEM_KEY) - break; + ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0); + if (ret) { + if (ret > 0) + ret = -ENOENT; + return ret; } - + btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) size = fs_info->extent_root->leafsize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) -- cgit v1.2.3 From bcbba5e6593281adc234938b42d3c3d3570335db Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 8 Feb 2014 23:46:35 +0800 Subject: Btrfs: skip readonly root for snapshot-aware defragment Btrfs send is assuming readonly root won't change, let's skip readonly root. 
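The shape of the fix is the common early-return-with-cleanup pattern. A rough user-space sketch under invented names (a pthread rwlock stands in for the subvol SRCU read lock) looks like this; the point is that the read-side lock taken for the lookup is dropped on the new bail-out path just as on the error path.

#include <pthread.h>
#include <stdio.h>

struct root {
	int readonly;
};

static pthread_rwlock_t subvol_lock = PTHREAD_RWLOCK_INITIALIZER;

static int relink_backref(struct root *root)
{
	pthread_rwlock_rdlock(&subvol_lock);   /* pin the root for the lookup */

	if (root->readonly) {
		/* nothing to defragment here, but the lock must still go */
		pthread_rwlock_unlock(&subvol_lock);
		return 0;
	}

	/* ... relink the extent backref while the root is pinned ... */

	pthread_rwlock_unlock(&subvol_lock);
	return 0;
}

int main(void)
{
	struct root ro = { .readonly = 1 };
	printf("%d\n", relink_backref(&ro));
	return 0;
}
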
Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b8925003090..b88f6221b48b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2240,6 +2240,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path, return PTR_ERR(root); } + if (btrfs_root_readonly(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + return 0; + } + /* step 2: get inode */ key.objectid = backref->inum; key.type = BTRFS_INODE_ITEM_KEY; -- cgit v1.2.3 From dcfd5ad2fc3337a959873e9d20ca33ad9809aa90 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Sat, 8 Feb 2014 23:46:36 +0800 Subject: Revert "Btrfs: remove transaction from btrfs send" This reverts commit 41ce9970a8a6a362ae8df145f7a03d789e9ef9d2. Previously i was thinking we can use readonly root's commit root safely while it is not true, readonly root may be cowed with the following cases. 1.snapshot send root will cow source root. 2.balance,device operations will also cow readonly send root to relocate. So i have two ideas to make us safe to use commit root. -->approach 1: make it protected by transaction and end transaction properly and we research next item from root node(see btrfs_search_slot_for_read()). -->approach 2: add another counter to local root structure to sync snapshot with send. and add a global counter to sync send with exclusive device operations. So with approach 2, send can use commit root safely, because we make sure send root can not be cowed during send. Unfortunately, it make codes *ugly* and more complex to maintain. To make snapshot and send exclusively, device operations and send operation exclusively with each other is a little confusing for common users. So why not drop into previous way. Cc: Josef Bacik Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a5da82f49120..3ddd2bb75083 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5104,6 +5104,7 @@ out: static int full_send_tree(struct send_ctx *sctx) { int ret; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; struct btrfs_key found_key; @@ -5125,6 +5126,19 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; +join_trans: + /* + * We need to make sure the transaction does not get committed + * while we do anything on commit roots. Join a transaction to prevent + * this. + */ + trans = btrfs_join_transaction(send_root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + trans = NULL; + goto out; + } + /* * Make sure the tree has not changed after re-joining. We detect this * by comparing start_ctransid and ctransid. They should always match. @@ -5148,6 +5162,19 @@ static int full_send_tree(struct send_ctx *sctx) goto out_finish; while (1) { + /* + * When someone want to commit while we iterate, end the + * joined transaction and rejoin. 
+ */ + if (btrfs_should_end_transaction(trans, send_root)) { + ret = btrfs_end_transaction(trans, send_root); + trans = NULL; + if (ret < 0) + goto out; + btrfs_release_path(path); + goto join_trans; + } + eb = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); @@ -5175,6 +5202,12 @@ out_finish: out: btrfs_free_path(path); + if (trans) { + if (!ret) + ret = btrfs_end_transaction(trans, send_root); + else + btrfs_end_transaction(trans, send_root); + } return ret; } -- cgit v1.2.3 From 51b98effa4c673feaa7237ba87645ea60d8f3578 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Sat, 8 Feb 2014 23:18:43 +0100 Subject: btrfs: always choose work from prio_head first In case we do not refill, we can overwrite cur pointer from prio_head by one from not prioritized head, what looks as something that was not intended. This change make we always take works from prio_head first until it's not empty. Signed-off-by: Stanislaw Gruszka Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c1e0b0caf9cc..0b78bf28ff5d 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -262,18 +262,19 @@ static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, struct btrfs_work *work = NULL; struct list_head *cur = NULL; - if (!list_empty(prio_head)) + if (!list_empty(prio_head)) { cur = prio_head->next; + goto out; + } smp_mb(); if (!list_empty(&worker->prio_pending)) goto refill; - if (!list_empty(head)) + if (!list_empty(head)) { cur = head->next; - - if (cur) goto out; + } refill: spin_lock_irq(&worker->lock); -- cgit v1.2.3 From d5f375270aa55794f4a7196b5247469f86278a8f Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 9 Feb 2014 23:45:12 +0000 Subject: Btrfs: faster/more efficient insertion of file extent items This is an extension to my previous commit titled: "Btrfs: faster file extent item replace operations" (hash 1acae57b161ef1282f565ef907f72aeed0eb71d9) Instead of inserting the new file extent item if we deleted existing file extent items covering our target file range, also allow to insert the new file extent item if we didn't find any existing items to delete and replace_extent != 0, since in this case our caller would do another tree search to insert the new file extent item anyway, therefore just combine the two tree searches into a single one, saving cpu time, reducing lock contention and reducing btree node/leaf COW operations. This covers the case where applications keep doing tail append writes to files, which for example is the case of Apache CouchDB (its database and view index files are always open with O_APPEND). 
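The saving is easier to see on a simpler data structure. The user-space sketch below (a sorted array standing in for a btree leaf, all names invented) keeps the position found while scanning for items to delete and reuses it for the insertion, so no second search happens even when the delete pass removed nothing.

#include <stdio.h>
#include <string.h>

/* One scan finds the range [lo, hi) overlapping the target; the same
 * slot is then reused for the replacement key. */
static int replace_range(int *keys, int n, int start, int end, int newkey)
{
	int lo = 0, hi;

	while (lo < n && keys[lo] < start)
		lo++;
	hi = lo;
	while (hi < n && keys[hi] < end)
		hi++;

	/* delete keys[lo..hi) by shifting the tail down */
	memmove(&keys[lo], &keys[hi], (n - hi) * sizeof(int));
	n -= hi - lo;

	/* insert at the slot we already hold instead of searching again */
	memmove(&keys[lo + 1], &keys[lo], (n - lo) * sizeof(int));
	keys[lo] = newkey;
	return n + 1;
}

int main(void)
{
	int keys[8] = { 0, 10, 20, 30 };
	int n = replace_range(keys, 4, 10, 30, 15);
	for (int i = 0; i < n; i++)
		printf("%d ", keys[i]);    /* prints: 0 15 30 */
	printf("\n");
	return 0;
}
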
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 52 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0165b8672f09..006af2f4dd98 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -720,7 +720,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); - if (start >= BTRFS_I(inode)->disk_i_size) + if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent) modify_tree = 0; while (1) { @@ -938,34 +938,42 @@ next_slot: * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted - * path->slots[0] for our insertion. + * path->slots[0] for our insertion (if replace_extent != 0). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, root, ret); + } - leaf = path->nodes[0]; - /* - * leaf eb has flag EXTENT_BUFFER_STALE if it was deleted (that - * is, its contents got pushed to its neighbors), in which case - * it means path->locks[0] == 0 - */ - if (!ret && replace_extent && leafs_visited == 1 && - path->locks[0] && - btrfs_leaf_free_space(root, leaf) >= - sizeof(struct btrfs_item) + extent_item_size) { - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); - *key_inserted = 1; + leaf = path->nodes[0]; + /* + * If btrfs_del_items() was called, it might have deleted a leaf, in + * which case it unlocked our path, so check path->locks[0] matches a + * write lock. + */ + if (!ret && replace_extent && leafs_visited == 1 && + (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || + path->locks[0] == BTRFS_WRITE_LOCK) && + btrfs_leaf_free_space(root, leaf) >= + sizeof(struct btrfs_item) + extent_item_size) { + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = start; + if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { + struct btrfs_key slot_key; + + btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); + if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) + path->slots[0]++; } + setup_items_for_insert(root, path, &key, + &extent_item_size, + extent_item_size, + sizeof(struct btrfs_item) + + extent_item_size, 1); + *key_inserted = 1; } if (!replace_extent || !(*key_inserted)) -- cgit v1.2.3 From 2a85d9cac160bb5b845985a60007cc8348d77def Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 10 Feb 2014 17:07:16 +0800 Subject: Btrfs: fix possible deadlock in btrfs_cleanup_transaction [13654.480669] ====================================================== [13654.480905] [ INFO: possible circular locking dependency detected ] [13654.481003] 3.12.0+ #4 Tainted: G W O [13654.481060] ------------------------------------------------------- [13654.481060] btrfs-transacti/9347 is trying to acquire lock: [13654.481060] (&(&root->ordered_extent_lock)->rlock){+.+...}, at: [] btrfs_cleanup_transaction+0x271/0x570 [btrfs] [13654.481060] but task is already holding lock: [13654.481060] (&(&fs_info->ordered_root_lock)->rlock){+.+...}, at: [] btrfs_cleanup_transaction+0x1e5/0x570 [btrfs] [13654.481060] which lock already depends on the new lock. 
[13654.481060] the existing dependency chain (in reverse order) is: [13654.481060] -> #1 (&(&fs_info->ordered_root_lock)->rlock){+.+...}: [13654.481060] [] lock_acquire+0x93/0x130 [13654.481060] [] _raw_spin_lock+0x41/0x50 [13654.481060] [] __btrfs_add_ordered_extent+0x39b/0x450 [btrfs] [13654.481060] [] btrfs_add_ordered_extent+0x32/0x40 [btrfs] [13654.481060] [] run_delalloc_nocow+0x78a/0x9d0 [btrfs] [13654.481060] [] run_delalloc_range+0x31d/0x390 [btrfs] [13654.481060] [] __extent_writepage+0x310/0x780 [btrfs] [13654.481060] [] extent_write_cache_pages.isra.29.constprop.48+0x29a/0x410 [btrfs] [13654.481060] [] extent_writepages+0x4d/0x70 [btrfs] [13654.481060] [] btrfs_writepages+0x28/0x30 [btrfs] [13654.481060] [] do_writepages+0x21/0x50 [13654.481060] [] __filemap_fdatawrite_range+0x59/0x60 [13654.481060] [] filemap_fdatawrite_range+0x13/0x20 [13654.481060] [] btrfs_wait_ordered_range+0x49/0x140 [btrfs] [13654.481060] [] __btrfs_write_out_cache+0x682/0x8b0 [btrfs] [13654.481060] [] btrfs_write_out_cache+0x8d/0xe0 [btrfs] [13654.481060] [] btrfs_write_dirty_block_groups+0x593/0x680 [btrfs] [13654.481060] [] commit_cowonly_roots+0x14b/0x20d [btrfs] [13654.481060] [] btrfs_commit_transaction+0x43a/0x9d0 [btrfs] [13654.481060] [] btrfs_create_uuid_tree+0x5a/0x100 [btrfs] [13654.481060] [] open_ctree+0x21da/0x2210 [btrfs] [13654.481060] [] btrfs_mount+0x68e/0x870 [btrfs] [13654.481060] [] mount_fs+0x39/0x1b0 [13654.481060] [] vfs_kern_mount+0x63/0xf0 [13654.481060] [] do_mount+0x23e/0xa90 [13654.481060] [] SyS_mount+0x83/0xc0 [13654.481060] [] system_call_fastpath+0x16/0x1b [13654.481060] -> #0 (&(&root->ordered_extent_lock)->rlock){+.+...}: [13654.481060] [] __lock_acquire+0x150a/0x1a70 [13654.481060] [] lock_acquire+0x93/0x130 [13654.481060] [] _raw_spin_lock+0x41/0x50 [13654.481060] [] btrfs_cleanup_transaction+0x271/0x570 [btrfs] [13654.481060] [] transaction_kthread+0x22e/0x270 [btrfs] [13654.481060] [] kthread+0xea/0xf0 [13654.481060] [] ret_from_fork+0x7c/0xb0 [13654.481060] other info that might help us debug this: [13654.481060] Possible unsafe locking scenario: [13654.481060] CPU0 CPU1 [13654.481060] ---- ---- [13654.481060] lock(&(&fs_info->ordered_root_lock)->rlock); [13654.481060] lock(&(&root->ordered_extent_lock)->rlock); [13654.481060] lock(&(&fs_info->ordered_root_lock)->rlock); [13654.481060] lock(&(&root->ordered_extent_lock)->rlock); [13654.481060] *** DEADLOCK *** [...] ====================================================== btrfs_destroy_all_ordered_extents() gets &fs_info->ordered_root_lock __BEFORE__ acquiring &root->ordered_extent_lock, while btrfs_[add,remove]_ordered_extent() acquires &fs_info->ordered_root_lock __AFTER__ getting &root->ordered_extent_lock. This patch fixes the above problem. 
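The rule the fix enforces is the usual one for nested locks: never take them in opposite orders on different paths. A small user-space sketch of the corrected cleanup loop (pthread mutexes under invented names, not the btrfs code) drops the outer list lock before touching the per-root lock and re-takes it afterwards.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ordered_root_lock = PTHREAD_MUTEX_INITIALIZER;   /* outer */
static pthread_mutex_t ordered_extent_lock = PTHREAD_MUTEX_INITIALIZER; /* inner on the other path */

/* Never hold ordered_root_lock while taking ordered_extent_lock, since
 * the writeback path acquires them in the opposite order. */
static void cleanup_all_roots(int nroots)
{
	pthread_mutex_lock(&ordered_root_lock);
	for (int i = 0; i < nroots; i++) {
		pthread_mutex_unlock(&ordered_root_lock);

		pthread_mutex_lock(&ordered_extent_lock);
		/* ... destroy this root's ordered extents ... */
		pthread_mutex_unlock(&ordered_extent_lock);

		pthread_mutex_lock(&ordered_root_lock);
	}
	pthread_mutex_unlock(&ordered_root_lock);
}

int main(void)
{
	cleanup_all_roots(3);
	printf("done\n");
	return 0;
}
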
Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 74c9be89fc0c..cc1b4237dc62 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3808,9 +3808,11 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) list_move_tail(&root->ordered_root, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); btrfs_destroy_ordered_extents(root); - cond_resched_lock(&fs_info->ordered_root_lock); + cond_resched(); + spin_lock(&fs_info->ordered_root_lock); } spin_unlock(&fs_info->ordered_root_lock); } -- cgit v1.2.3 From 7813b3db0a9ec77ff1f4b3ee3fb4925848395d59 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 10 Feb 2014 17:37:25 +0800 Subject: Btrfs: avoid warning bomb of btrfs_invalidate_inodes So after transaction is aborted, we need to cleanup inode resources by calling btrfs_invalidate_inodes(), and btrfs_invalidate_inodes() hopes roots' refs to be zero in old times and sets a WARN_ON(), however, this is not always true within cleaning up transaction, so we get to detect transaction abortion and not warn at all. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b88f6221b48b..8dba152883d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4926,7 +4926,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - WARN_ON(btrfs_root_refs(&root->root_item) != 0); + if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: -- cgit v1.2.3 From 5c902ba6223f6a6575054226931fafc51314a25f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:51 +0800 Subject: Btrfs: use ACCESS_ONCE to prevent the optimize accesses to ->last_trans_log_full_commit ->last_trans_log_full_commit may be changed by the other tasks without lock, so we need prevent the compiler from the optimize access just like tmp = fs_info->last_trans_log_full_commit if (tmp == ...) ... if (tmp == ...) ... In fact, we need get the new value of ->last_trans_log_full_commit during the second access. Fix it by ACCESS_ONCE(). 
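For reference, ACCESS_ONCE() boils down to a volatile cast, and the effect is easiest to see in a tiny user-space example (the spinning function below is invented for illustration): without the volatile access the compiler may load the variable once and reuse the value for both tests, which is exactly the optimization this patch has to prevent.

#include <stdio.h>

/* What the kernel's ACCESS_ONCE() expands to, modulo the typeof spelling. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

int shared_flag;  /* written by another thread in the real scenario */

void wait_for_flag(void)
{
	/*
	 * The volatile cast forces a fresh load on every test; without it
	 * the compiler could hoist the load and spin on a stale value.
	 */
	while (!ACCESS_ONCE(shared_flag))
		;  /* spin */
}

int main(void)
{
	shared_flag = 1;
	wait_for_flag();
	printf("flag observed\n");
	return 0;
}
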
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7c449c699bed..5a4e10b9ac3e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2375,14 +2375,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->fs_info->last_trans_log_full_commit != + } while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); return 0; @@ -2392,12 +2392,12 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root->fs_info->last_trans_log_full_commit != + while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->fs_info->last_trans_log_full_commit != + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != trans->transid && atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); @@ -2456,7 +2456,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2515,7 +2516,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); goto out; } - root->fs_info->last_trans_log_full_commit = trans->transid; + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2547,7 +2549,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root->fs_info->last_trans_log_full_commit == trans->transid) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); -- cgit v1.2.3 From 48cab2e0714913a63155f800a55609a4ff6a36b9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:52 +0800 Subject: Btrfs: fix the skipped transaction commit during the file sync We may abort the wait earlier if ->last_trans_log_full_commit was set to the current transaction id, at this case, we need commit the current transaction instead of the log sub-transaction. But the current code didn't tell the caller to do it (return 0, not -EAGAIN). Fix it. 
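A stripped-down sketch of the control flow (invented names, user space): the waiter has to hand the "full commit needed" condition back to its caller instead of swallowing it, otherwise fsync reports success for data that is not yet safe.

#include <errno.h>
#include <stdio.h>

/* Stand-ins for the real state; names are invented for the sketch. */
static long last_full_commit_transid;
static long current_transid = 42;

/* Waiting for someone else's log commit must report when that commit
 * was abandoned in favour of a full transaction commit. */
static int wait_log_commit(void)
{
	if (last_full_commit_transid == current_transid)
		return -EAGAIN;      /* tell the caller: do a full commit */
	/* ... otherwise sleep until the running log commit finishes ... */
	return 0;
}

static int sync_log(void)
{
	int ret = wait_log_commit();
	return ret;                  /* was "return 0", hiding the -EAGAIN */
}

int main(void)
{
	last_full_commit_transid = current_transid;
	printf("sync_log() = %d\n", sync_log());
	return 0;
}
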
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a4e10b9ac3e..8a03b39648be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2364,6 +2364,7 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, { DEFINE_WAIT(wait); int index = transid % 2; + int ret = 0; /* * we only allow two pending log transactions at a time, @@ -2371,21 +2372,26 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, * current transaction, we're done */ do { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { + ret = -EAGAIN; + break; + } + prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && root->log_transid < transid + 2 && + if (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && root->log_transid < transid + 2 && + } while (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); - return 0; + + return ret; } static void wait_for_writer(struct btrfs_trans_handle *trans, @@ -2433,15 +2439,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + ret = wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); - return 0; + return ret; } atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root->log_transid - 1); + while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ @@ -2529,11 +2536,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + ret = wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = 0; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); -- cgit v1.2.3 From e87ac1368700af66c295afa47e5c7df0d9d8b919 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:53 +0800 Subject: Btrfs: don't start the log transaction if the log tree init fails The old code would start the log transaction even the log tree init failed, it was unnecessary. Fix it. 
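The reordering amounts to "set everything up first, publish only on success". A user-space sketch with invented names and stubbed helpers:

#include <pthread.h>
#include <unistd.h>
#include <stdio.h>

struct root {
	pthread_mutex_t log_mutex;
	int log_writers;
	pid_t log_start_pid;
};

static int init_log_root_tree(struct root *r) { (void)r; return 0; }
static int add_log_tree(struct root *r)       { (void)r; return 0; }

static int start_log_trans(struct root *root)
{
	int ret;

	pthread_mutex_lock(&root->log_mutex);

	ret = init_log_root_tree(root);       /* may fail */
	if (ret)
		goto out;                     /* nothing was published */
	ret = add_log_tree(root);
	if (ret)
		goto out;

	root->log_start_pid = getpid();       /* publish only on success */
	root->log_writers++;
out:
	pthread_mutex_unlock(&root->log_mutex);
	return ret;
}

int main(void)
{
	struct root r = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
	printf("start_log_trans = %d, writers = %d\n",
	       start_log_trans(&r), r.log_writers);
	return 0;
}
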
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8a03b39648be..ca960ad271fe 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -139,7 +139,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,24 +154,27 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); return 0; } - root->log_multiple_pids = false; - root->log_start_pid = current->pid; + + ret = 0; mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) { + if (!root->fs_info->log_root_tree) ret = btrfs_init_log_root_tree(trans, root->fs_info); - if (ret) - err = ret; - } - if (err == 0 && !root->log_root) { + mutex_unlock(&root->fs_info->tree_log_mutex); + if (ret) + goto out; + + if (!root->log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) - err = ret; + goto out; } - mutex_unlock(&root->fs_info->tree_log_mutex); + root->log_multiple_pids = false; + root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); +out: mutex_unlock(&root->log_mutex); - return err; + return ret; } /* @@ -4116,7 +4118,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, ret = start_log_trans(trans, root); if (ret) - goto end_trans; + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); if (ret) -- cgit v1.2.3 From 7483e1a4464999c72b231af0efe39cb31fd73f14 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:55 +0800 Subject: Btrfs: remove unnecessary memory barrier in btrfs_sync_log() Mutex unlock implies certain memory barriers to make sure all the memory operation completes before the unlock, and the next mutex lock implies memory barriers to make sure the all the memory happens after the lock. So it is a full memory barrier(smp_mb), we needn't add memory barriers. Remove them. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ca960ad271fe..285c168391f3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2496,7 +2496,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; - smp_mb(); /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to @@ -2589,8 +2588,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; - smp_mb(); - mutex_unlock(&log_root_tree->log_mutex); /* -- cgit v1.2.3 From bb14a59b619d3a9993c3fa04bb10347db35ca550 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:56 +0800 Subject: Btrfs: use signed integer instead of unsigned long integer for log transid The log trans id is initialized to be 0 every time we create a log tree, and the log tree need be re-created after a new transaction is started, it means the log trans id is unlikely to be a huge number, so we can use signed integer instead of unsigned long integer to save a bit space. 
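The saving can be checked with a trivial program (the exact numbers depend on the architecture and on how neighbouring fields pack; on a typical 64-bit build the pair of longs costs 16 bytes against 8 for the pair of ints):

#include <stdio.h>

struct with_longs {
	unsigned long log_transid;
	unsigned long last_log_commit;
};

struct with_ints {
	int log_transid;
	int last_log_commit;
};

int main(void)
{
	printf("unsigned long pair: %zu bytes\n", sizeof(struct with_longs));
	printf("int pair:           %zu bytes\n", sizeof(struct with_ints));
	return 0;
}
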
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 14 +++++++------- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/tree-log.c | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8fed2125689e..c9a24444ec9a 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -109,14 +109,17 @@ struct btrfs_inode { u64 last_trans; /* - * log transid when this inode was last modified + * transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* - * transid that last logged this inode + * log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file @@ -155,9 +158,6 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; - /* a local copy of root's last_log_commit */ - unsigned long last_log_commit; - /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dac6653d4cce..70c03f5f0953 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1721,8 +1721,8 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; - unsigned long log_transid; - unsigned long last_log_commit; + int log_transid; + int last_log_commit; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 285c168391f3..128a904ceac0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2362,7 +2362,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, } static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2434,7 +2434,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; struct blk_plug plug; mutex_lock(&root->log_mutex); -- cgit v1.2.3 From 8b050d350c7846462a21e9e054c9154ede9b43cf Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:58 +0800 Subject: Btrfs: fix skipped error handle when log sync failed It is possible that many tasks sync the log tree at the same time, but only one task can do the sync work, the others will wait for it. But those wait tasks didn't get the result of the log sync, and returned 0 when they ended the wait. It caused those tasks skipped the error handle, and the serious problem was they told the users the file sync succeeded but in fact they failed. This patch fixes this problem by introducing a log context structure, we insert it into the a global list. When the sync fails, we will set the error number of every log context in the list, then the waiting tasks get the error number of the log context and handle the error if need. 
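The core of the mechanism fits in a few lines. The user-space sketch below (singly linked list, invented names, no locking) shows the contract: each waiter registers a context, and whoever finishes the log commit writes the result into every registered context before the waiters look at it.

#include <stdio.h>

struct log_ctx {
	int log_ret;
	struct log_ctx *next;
};

static struct log_ctx *ctx_list;   /* protected by log_mutex in reality */

static void register_ctx(struct log_ctx *ctx)
{
	ctx->log_ret = 0;
	ctx->next = ctx_list;
	ctx_list = ctx;
}

static void finish_log_commit(int error)
{
	for (struct log_ctx *c = ctx_list; c; c = c->next)
		c->log_ret = error;        /* every waiter sees the error */
	ctx_list = NULL;
}

int main(void)
{
	struct log_ctx a, b;

	register_ctx(&a);
	register_ctx(&b);
	finish_log_commit(-5);             /* simulate an -EIO sync failure */
	printf("a=%d b=%d\n", a.log_ret, b.log_ret);
	return 0;
}
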
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 + fs/btrfs/file.c | 9 +++-- fs/btrfs/tree-log.c | 114 ++++++++++++++++++++++++++++++++++++++++------------ fs/btrfs/tree-log.h | 16 +++++++- 5 files changed, 111 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 70c03f5f0953..906410719acb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1718,6 +1718,7 @@ struct btrfs_root { struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cc1b4237dc62..44f52d280b7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1200,6 +1200,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); + INIT_LIST_HEAD(&root->log_ctxs[0]); + INIT_LIST_HEAD(&root->log_ctxs[1]); atomic_set(&root->log_commit[0], 0); atomic_set(&root->log_commit[1], 0); atomic_set(&root->log_writers, 0); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 006af2f4dd98..6acccc4a7f2a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1864,8 +1864,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1959,7 +1960,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); + btrfs_init_log_ctx(&ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. 
*/ ret = 1; @@ -1979,7 +1982,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { - ret = btrfs_sync_log(trans, root); + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans, root); goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 128a904ceac0..da6da274dce3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -136,8 +136,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; mutex_lock(&root->log_mutex); @@ -151,6 +153,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + } mutex_unlock(&root->log_mutex); return 0; } @@ -172,6 +178,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root->log_start_pid = current->pid; atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); + if (ctx) { + index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + } out: mutex_unlock(&root->log_mutex); return ret; @@ -2361,12 +2371,11 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int transid) +static void wait_log_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; - int ret = 0; /* * we only allow two pending log transactions at a time, @@ -2374,12 +2383,6 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, * current transaction, we're done */ do { - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == - trans->transid) { - ret = -EAGAIN; - break; - } - prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); @@ -2392,27 +2395,55 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); } while (root->log_transid < transid + 2 && atomic_read(&root->log_commit[index])); - - return ret; } static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && atomic_read(&root->log_writers)) { + + while (atomic_read(&root->log_writers)) { prepare_to_wait(&root->log_writer_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) != - trans->transid && atomic_read(&root->log_writers)) + if (atomic_read(&root->log_writers)) schedule(); mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); } } +static inline void btrfs_remove_log_ctx(struct btrfs_root *root, + struct btrfs_log_ctx *ctx) +{ + if (!ctx) + return; + + mutex_lock(&root->log_mutex); + list_del_init(&ctx->list); + mutex_unlock(&root->log_mutex); +} + +/* + * Invoked in log mutex context, or be sure there is no other task which + * can access the list. 
+ */ +static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, + int index, int error) +{ + struct btrfs_log_ctx *ctx; + + if (!error) { + INIT_LIST_HEAD(&root->log_ctxs[index]); + return; + } + + list_for_each_entry(ctx, &root->log_ctxs[index], list) + ctx->log_ret = error; + + INIT_LIST_HEAD(&root->log_ctxs[index]); +} + /* * btrfs_sync_log does sends a given tree log down to the disk and * updates the super blocks to record it. When this call is done, @@ -2426,7 +2457,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, * that has happened. */ int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, struct btrfs_log_ctx *ctx) { int index1; int index2; @@ -2435,15 +2466,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; int log_transid = 0; + struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; mutex_lock(&root->log_mutex); log_transid = root->log_transid; index1 = root->log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - ret = wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, root->log_transid); mutex_unlock(&root->log_mutex); - return ret; + return ctx->log_ret; } atomic_set(&root->log_commit[index1], 1); @@ -2534,13 +2566,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } index2 = log_root_tree->log_transid % 2; + + btrfs_init_log_ctx(&root_log_ctx); + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - ret = wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); @@ -2609,12 +2646,31 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); out_wake_log_root: + /* + * We needn't get log_mutex here because we are sure all + * the other tasks are blocked. + */ + btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); + + /* + * It is dangerous if log_commit is changed before we set + * ->log_ret of log ctx. Because the readers may not get + * the return value. + */ + smp_wmb(); + atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: + /* See above. */ + btrfs_remove_all_log_ctxs(root, index1, ret); + + /* See above. */ + smp_wmb(); atomic_set(&root->log_commit[index1], 0); + smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); @@ -4076,7 +4132,8 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only) + struct dentry *parent, int exists_only, + struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? 
LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; @@ -4113,7 +4170,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - ret = start_log_trans(trans, root); + ret = start_log_trans(trans, root, ctx); if (ret) goto end_no_trans; @@ -4163,6 +4220,9 @@ end_trans: root->fs_info->last_trans_log_full_commit = trans->transid; ret = 1; } + + if (ret) + btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); end_no_trans: return ret; @@ -4175,12 +4235,14 @@ end_no_trans: * data on disk. */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry) + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + 0, ctx); dput(parent); return ret; @@ -4417,6 +4479,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1); + return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1d4ae0d15a70..59c1edb31d19 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -22,14 +22,26 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +struct btrfs_log_ctx { + int log_ret; + struct list_head list; +}; + +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +{ + ctx->log_ret = 0; + INIT_LIST_HEAD(&ctx->list); +} + int btrfs_sync_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct dentry *dentry); + struct btrfs_root *root, struct dentry *dentry, + struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, -- cgit v1.2.3 From d1433debe7f4346cf9fc0dafc71c3137d2a97bc4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:59 +0800 Subject: Btrfs: just wait or commit our own log sub-transaction We might commit the log sub-transaction which didn't contain the metadata we logged. It was because we didn't record the log transid and just select the current log sub-transaction to commit, but the right one might be committed by the other task already. Actually, we needn't do anything and it is safe that we go back directly in this case. This patch improves the log sync by the above idea. We record the transid of the log sub-transaction in which we log the metadata, and the transid of the log sub-transaction we have committed. If the committed transid is >= the transid we record when logging the metadata, we just go back. 
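Sketched in user space with invented names, the early-out reads like this: a waiter remembers the log sub-transaction its metadata went into, and if the committed counter has already passed it, the waiter simply returns its recorded result instead of joining or waiting for another commit.

#include <stdio.h>

struct log_root {
	int log_transid;            /* sub-transaction currently open */
	int log_transid_committed;  /* highest sub-transaction committed */
};

static int sync_log(struct log_root *root, int my_transid, int my_log_ret)
{
	if (root->log_transid_committed >= my_transid)
		return my_log_ret;  /* our metadata is already on disk */
	/* ... otherwise take over or wait for the running commit ... */
	return 0;
}

int main(void)
{
	struct log_root r = { .log_transid = 7, .log_transid_committed = 6 };

	/* transid 6 was already committed by somebody else: nothing to do */
	printf("%d\n", sync_log(&r, 6, 0));
	return 0;
}
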
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/tree-log.c | 63 ++++++++++++++++++++++++++++++++++------------------- fs/btrfs/tree-log.h | 2 ++ 4 files changed, 47 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 906410719acb..b2c0336db691 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1723,6 +1723,9 @@ struct btrfs_root { atomic_t log_commit[2]; atomic_t log_batch; int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; bool log_multiple_pids; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44f52d280b7d..dd52146035b3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1209,6 +1209,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; if (fs_info) extent_io_tree_init(&root->dirty_log_pages, @@ -1422,6 +1423,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->log_transid_committed = -1; root->last_log_commit = 0; return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index da6da274dce3..57d4ca7fd555 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -156,6 +156,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ctx) { index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; } mutex_unlock(&root->log_mutex); return 0; @@ -181,6 +182,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ctx) { index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; } out: mutex_unlock(&root->log_mutex); @@ -2387,13 +2389,13 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&root->log_mutex); - if (root->log_transid < transid + 2 && + if (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])) schedule(); finish_wait(&root->log_commit_wait[index], &wait); mutex_lock(&root->log_mutex); - } while (root->log_transid < transid + 2 && + } while (root->log_transid_committed < transid && atomic_read(&root->log_commit[index])); } @@ -2470,18 +2472,24 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct blk_plug plug; mutex_lock(&root->log_mutex); - log_transid = root->log_transid; - index1 = root->log_transid % 2; + log_transid = ctx->log_transid; + if (root->log_transid_committed >= log_transid) { + mutex_unlock(&root->log_mutex); + return ctx->log_ret; + } + + index1 = log_transid % 2; if (atomic_read(&root->log_commit[index1])) { - wait_log_commit(trans, root, root->log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(&root->log_mutex); return ctx->log_ret; } + ASSERT(log_transid == root->log_transid); atomic_set(&root->log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(&root->log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root->log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); while (1) { int batch = atomic_read(&root->log_batch); @@ -2535,9 +2543,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ 
mutex_unlock(&root->log_mutex); + btrfs_init_log_ctx(&root_log_ctx); + mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); atomic_inc(&log_root_tree->log_writers); + + index2 = log_root_tree->log_transid % 2; + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + root_log_ctx.log_transid = log_root_tree->log_transid; + mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); @@ -2550,6 +2565,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (ret) { + if (!list_empty(&root_log_ctx.list)) + list_del_init(&root_log_ctx.list); + blk_finish_plug(&plug); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); @@ -2565,26 +2583,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = log_root_tree->log_transid % 2; - - btrfs_init_log_ctx(&root_log_ctx); - list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); + if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + mutex_unlock(&log_root_tree->log_mutex); + ret = root_log_ctx.log_ret; + goto out; + } + index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid); + root_log_ctx.log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; } + ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); atomic_set(&log_root_tree->log_commit[index2], 1); if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) { wait_log_commit(trans, log_root_tree, - log_root_tree->log_transid - 1); + root_log_ctx.log_transid - 1); } wait_for_writer(trans, log_root_tree); @@ -2652,26 +2673,22 @@ out_wake_log_root: */ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret); - /* - * It is dangerous if log_commit is changed before we set - * ->log_ret of log ctx. Because the readers may not get - * the return value. - */ - smp_wmb(); - + mutex_lock(&log_root_tree->log_mutex); + log_root_tree->log_transid_committed++; atomic_set(&log_root_tree->log_commit[index2], 0); - smp_mb(); + mutex_unlock(&log_root_tree->log_mutex); + if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: /* See above. */ btrfs_remove_all_log_ctxs(root, index1, ret); - /* See above. */ - smp_wmb(); + mutex_lock(&root->log_mutex); + root->log_transid_committed++; atomic_set(&root->log_commit[index1], 0); + mutex_unlock(&root->log_mutex); - smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 59c1edb31d19..91b145fce333 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -24,12 +24,14 @@ struct btrfs_log_ctx { int log_ret; + int log_transid; struct list_head list; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) { ctx->log_ret = 0; + ctx->log_transid = 0; INIT_LIST_HEAD(&ctx->list); } -- cgit v1.2.3 From 50471a388cf011523f3bf91d275ec3f30669f0ee Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 20 Feb 2014 18:08:57 +0800 Subject: Btrfs: stop joining the log transaction if sync log fails If the log sync fails, there is something wrong in the log tree, we should not continue to join the log transaction and log the metadata. What we should do is to do a full commit. 
This patch fixes this problem by setting ->last_trans_log_full_commit to the current transaction id, it will tell the tasks not to join the log transaction, and do a full commit. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 57d4ca7fd555..e2f45fc02610 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -144,6 +144,12 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) == + trans->transid) { + ret = -EAGAIN; + goto out; + } + if (!root->log_start_pid) { root->log_start_pid = current->pid; root->log_multiple_pids = false; @@ -2527,6 +2533,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; mutex_unlock(&root->log_mutex); goto out; } @@ -2569,13 +2577,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } - ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = - trans->transid; btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2629,6 +2637,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2657,6 +2667,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { + ACCESS_ONCE(root->fs_info->last_trans_log_full_commit) = + trans->transid; btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } -- cgit v1.2.3 From 2c6a92b0097464e08caaa1caeb8baa9d470ab990 Mon Sep 17 00:00:00 2001 From: Justin Maggard Date: Thu, 20 Feb 2014 08:48:07 -0800 Subject: btrfs: wake up transaction thread upon remount Now that we can adjust the commit interval with a remount, we need to wake up the transaction thread or else he will continue to sleep until the previous transaction interval has elapsed before waking up. So, if we go from a large commit interval to something smaller, the transaction thread will not wake up until the large interval has expired. This also causes the cleaner thread to stay sleeping, since it gets woken up by the transaction thread. Fix it by simply waking up the transaction thread during a remount. 
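The effect can be modelled in user space with a condition variable (invented names, not the kernel code): the committer sleeps for at most the old interval, and the remount path signals it so a shorter interval takes effect immediately.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static int commit_interval = 300;   /* seconds */
static int kicked;

/* The committer sleeps up to the old interval unless it is kicked. */
static void *transaction_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	if (!kicked) {
		struct timespec ts;

		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += commit_interval;
		pthread_cond_timedwait(&wake, &lock, &ts);
	}
	printf("committing, interval is now %d\n", commit_interval);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Remount: change the interval and wake the sleeper immediately. */
static void remount(int new_interval)
{
	pthread_mutex_lock(&lock);
	commit_interval = new_interval;
	kicked = 1;
	pthread_cond_signal(&wake);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, transaction_thread, NULL);
	remount(5);
	pthread_join(t, NULL);
	return 0;
}
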
Signed-off-by: Justin Maggard Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d04db817be5c..426b7c602653 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1479,6 +1479,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } out: + wake_up_process(fs_info->transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); return 0; -- cgit v1.2.3 From 6103fb43fbc6d1caa78f26a1d0aa3d1a4525cea5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Feb 2014 15:07:52 +0000 Subject: Btrfs: remove unnecessary ref heads rb tree search When we didn't find the exact ref head we were looking for, if return_bigger != 0 we set a new search key to match either the next node after the last one we found or the first one in the ref heads rb tree, and then did another full tree search. For both cases this ended up being pointless as we would end up returning an entry we already had before repeating the search. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/delayed-ref.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index f3bff89eecf0..56cdfe988d69 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -205,7 +205,6 @@ find_ref_head(struct rb_root *root, u64 bytenr, struct btrfs_delayed_ref_head *entry; int cmp = 0; -again: n = root->rb_node; entry = NULL; while (n) { @@ -234,9 +233,9 @@ again: n = rb_first(root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - bytenr = entry->node.bytenr; - return_bigger = 0; - goto again; + if (last) + *last = entry; + return entry; } return entry; } -- cgit v1.2.3 From 85fdfdf6118dc00c2fcea8907815e48c98ee6c1d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Feb 2014 15:07:53 +0000 Subject: Btrfs: cleanup delayed-ref.c:find_ref_head() The argument last wasn't used, all callers supplied a NULL value for it. Also removed unnecessary intermediate storage of the result of key comparisons. 
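The simplified search is easier to follow on a plain sorted array. This user-space sketch (invented names) keeps the same semantics as the cleaned-up function: compare the key directly at each step and, when asked for a bigger entry on a miss, step to the next one, wrapping around to the smallest.

#include <stdio.h>

static int find_head(const unsigned long *keys, int n, unsigned long bytenr,
		     int return_bigger)
{
	int lo = 0, hi = n - 1, last = -1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		last = mid;
		if (bytenr < keys[mid])
			hi = mid - 1;
		else if (bytenr > keys[mid])
			lo = mid + 1;
		else
			return mid;          /* exact match */
	}
	if (last >= 0 && return_bigger) {
		if (bytenr > keys[last])
			last = (last + 1) % n;  /* next entry, wrap to first */
		return last;
	}
	return -1;
}

int main(void)
{
	unsigned long keys[] = { 10, 20, 30 };

	printf("%d %d %d\n",
	       find_head(keys, 3, 20, 0),   /* exact hit  -> 1 */
	       find_head(keys, 3, 25, 1),   /* bigger     -> 2 (30) */
	       find_head(keys, 3, 35, 1));  /* wraps      -> 0 (10) */
	return 0;
}
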
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/delayed-ref.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 56cdfe988d69..2502ba5a3ac0 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -199,42 +199,30 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, */ static struct btrfs_delayed_ref_head * find_ref_head(struct rb_root *root, u64 bytenr, - struct btrfs_delayed_ref_head **last, int return_bigger) + int return_bigger) { struct rb_node *n; struct btrfs_delayed_ref_head *entry; - int cmp = 0; n = root->rb_node; entry = NULL; while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (last) - *last = entry; if (bytenr < entry->node.bytenr) - cmp = -1; - else if (bytenr > entry->node.bytenr) - cmp = 1; - else - cmp = 0; - - if (cmp < 0) n = n->rb_left; - else if (cmp > 0) + else if (bytenr > entry->node.bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (cmp > 0) { + if (bytenr > entry->node.bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (last) - *last = entry; return entry; } return entry; @@ -414,12 +402,12 @@ btrfs_select_ref_head(struct btrfs_trans_handle *trans) again: start = delayed_refs->run_delayed_start; - head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + head = find_ref_head(&delayed_refs->href_root, start, 1); if (!head && !loop) { delayed_refs->run_delayed_start = 0; start = 0; loop = true; - head = find_ref_head(&delayed_refs->href_root, start, NULL, 1); + head = find_ref_head(&delayed_refs->href_root, start, 1); if (!head) return NULL; } else if (!head && loop) { @@ -897,7 +885,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - return find_ref_head(&delayed_refs->href_root, bytenr, NULL, 0); + return find_ref_head(&delayed_refs->href_root, bytenr, 0); } void btrfs_delayed_ref_exit(void) -- cgit v1.2.3 From 12870f1c9b2de7d475d22e73fd7db1b418599725 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 15 Feb 2014 15:55:58 +0000 Subject: Btrfs: don't insert useless holes when punching beyond the inode's size If we punch beyond the size of an inode, we'll correctly remove any prealloc extents, but we'll also insert file extent items representing holes (disk bytenr == 0) that start with a key offset that lies beyond the inode's size and are not contiguous with the last file extent item. 
Example: $XFS_IO_PROG -f -c "truncate 118811" $SCRATCH_MNT/foo $XFS_IO_PROG -c "fpunch 582007 864596" $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0x0d -b 39987 92267 39987" $SCRATCH_MNT/foo btrfs-debug-tree output: item 4 key (257 INODE_ITEM 0) itemoff 15885 itemsize 160 inode generation 6 transid 6 size 132254 block group 0 mode 100600 links 1 item 5 key (257 INODE_REF 256) itemoff 15872 itemsize 13 inode ref index 2 namelen 3 name: foo item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53 extent data disk byte 0 nr 0 gen 6 extent data offset 0 nr 90112 ram 122880 extent compression 0 item 7 key (257 EXTENT_DATA 90112) itemoff 15766 itemsize 53 extent data disk byte 12845056 nr 4096 gen 6 extent data offset 0 nr 45056 ram 45056 extent compression 2 item 8 key (257 EXTENT_DATA 585728) itemoff 15713 itemsize 53 extent data disk byte 0 nr 0 gen 6 extent data offset 0 nr 860160 ram 860160 extent compression 0 The last extent item, which represents a hole, is useless as it lies beyond the inode's size. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6acccc4a7f2a..762ca32bd988 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2168,6 +2168,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page = ((offset >> PAGE_CACHE_SHIFT) == ((offset + len - 1) >> PAGE_CACHE_SHIFT)); bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + u64 ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2183,14 +2184,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. 
*/ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) + if (offset < ino_size) ret = btrfs_truncate_page(inode, offset, len, 0); mutex_unlock(&inode->i_mutex); return ret; } /* zero back part of the first page */ - if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset < ino_size) { ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2199,7 +2200,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } /* zero the front end of the last page */ - if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) { + if (offset + len < ino_size) { ret = btrfs_truncate_page(inode, offset + len, 0, 1); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2288,10 +2289,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - break; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, + drop_end); + if (ret) { + err = ret; + break; + } } cur_offset = drop_end; @@ -2324,10 +2328,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = fill_holes(trans, inode, path, cur_offset, drop_end); - if (ret) { - err = ret; - goto out_trans; + if (cur_offset < ino_size) { + ret = fill_holes(trans, inode, path, cur_offset, drop_end); + if (ret) { + err = ret; + goto out_trans; + } } out_trans: -- cgit v1.2.3 From 2b863a135f22f242ba4fc669f3a6b2f6c826832c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 16 Feb 2014 13:43:11 +0000 Subject: Btrfs: incremental send, fix invalid path after dir rename This fixes yet one more case not caught by the commit titled: Btrfs: fix infinite path build loops in incremental send In this case, even before the initial full send, we have a directory which is a child of a directory with a higher inode number. Then we perform the initial send, and afterwards we rename both the child and the parent, without moving them around. After doing these 2 renames, an incremental send sent a rename instruction for the child directory which contained an invalid "from" path (referenced the parent's old name, not the new one), which made the btrfs receive command fail. Steps to reproduce: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b $ mkdir /mnt/btrfs/d $ mkdir /mnt/btrfs/a/b/c $ mv /mnt/btrfs/d /mnt/btrfs/a/b/c $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ btrfs send /mnt/btrfs/snap1 -f /tmp/base.send $ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/x $ mv /mnt/btrfs/a/b/x/d /mnt/btrfs/a/b/x/y $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 -f /tmp/incremental.send $ umount /mnt/btrfs $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ btrfs receive /mnt/btrfs -f /tmp/base.send $ btrfs receive /mnt/btrfs -f /tmp/incremental.send The second btrfs receive command failed with: "ERROR: rename a/b/c/d -> a/b/x/y failed. No such file or directory" A test case for xfstests follows.
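For illustration (not part of the original report): with the steps above, the incremental stream is expected to rename the parent first and then build the child's "from" path using the parent's new name, roughly:

    rename a/b/c -> a/b/x
    rename a/b/x/d -> a/b/x/y

whereas the broken stream used the parent's old name for the child's "from" path (a/b/c/d), which no longer exists on the receiving side.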
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3ddd2bb75083..cb9502adccdc 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2857,19 +2857,48 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; struct fs_path *to_path = NULL; + struct fs_path *name = NULL; u64 orig_progress = sctx->send_progress; struct recorded_ref *cur; + u64 parent_ino, parent_gen; int ret; + name = fs_path_alloc(); from_path = fs_path_alloc(); - if (!from_path) - return -ENOMEM; + if (!name || !from_path) { + ret = -ENOMEM; + goto out; + } - sctx->send_progress = pm->ino; - ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + ret = del_waiting_dir_move(sctx, pm->ino); + ASSERT(ret == 0); + + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); if (ret < 0) goto out; + if (parent_ino == sctx->cur_ino) { + /* child only renamed, not moved */ + ASSERT(parent_gen == sctx->cur_inode_gen); + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + if (ret < 0) + goto out; + } else { + /* child moved and maybe renamed too */ + sctx->send_progress = pm->ino; + ret = get_cur_path(sctx, pm->ino, pm->gen, from_path); + if (ret < 0) + goto out; + } + + fs_path_free(name); + name = NULL; + to_path = fs_path_alloc(); if (!to_path) { ret = -ENOMEM; @@ -2877,9 +2906,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } sctx->send_progress = sctx->cur_ino + 1; - ret = del_waiting_dir_move(sctx, pm->ino); - ASSERT(ret == 0); - ret = get_cur_path(sctx, pm->ino, pm->gen, to_path); if (ret < 0) goto out; @@ -2903,6 +2929,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) } out: + fs_path_free(name); fs_path_free(from_path); fs_path_free(to_path); sctx->send_progress = orig_progress; -- cgit v1.2.3 From 29d6d30f5c8aa58b04f40a58442df3bcaae5a1d5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 16 Feb 2014 21:01:39 +0000 Subject: Btrfs: send, don't send rmdir for same target multiple times When doing an incremental send, if we delete a directory that has N > 1 hardlinks for the same file and that file has the highest inode number inside the directory contents, an incremental send would send N times an rmdir operation against the directory. This made the btrfs receive command fail on the second rmdir instruction, as the target directory didn't exist anymore. Steps to reproduce the issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/c $ echo 'ola mundo' > /mnt/btrfs/a/b/c/foo.txt $ ln /mnt/btrfs/a/b/c/foo.txt /mnt/btrfs/a/b/c/bar.txt $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ btrfs send /mnt/btrfs/snap1 -f /tmp/base.send $ rm -f /mnt/btrfs/a/b/c/foo.txt $ rm -f /mnt/btrfs/a/b/c/bar.txt $ rmdir /mnt/btrfs/a/b/c $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 -f /tmp/incremental.send $ umount /mnt/btrfs $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ btrfs receive /mnt/btrfs -f /tmp/base.send $ btrfs receive /mnt/btrfs -f /tmp/incremental.send The second btrfs receive command failed with: ERROR: rmdir o259-6-0 failed. No such file or directory A test case for xfstests follows. 
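In other words (an illustrative sketch of the stream): with the two hard links above, the old code emitted the same instruction twice, roughly:

    rmdir o259-6-0
    rmdir o259-6-0    <- fails, the directory is already gone

With the fix, the inode number of the directory that was just removed is remembered (the last_dir_ino_rm variable in the patch below), so the second attempt is skipped.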
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index cb9502adccdc..cdfd4357a784 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3085,6 +3085,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_gen; int did_overwrite = 0; int is_orphan = 0; + u64 last_dir_ino_rm = 0; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3349,7 +3350,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - } else if (ret == inode_state_did_delete) { + } else if (ret == inode_state_did_delete && + cur->dir != last_dir_ino_rm) { ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); if (ret < 0) goto out; @@ -3361,6 +3363,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = send_rmdir(sctx, valid_path); if (ret < 0) goto out; + last_dir_ino_rm = cur->dir; } } } -- cgit v1.2.3 From 9dc442143b9874ba677fc83bf8c60744ec642998 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Feb 2014 14:31:44 +0000 Subject: Btrfs: fix send attempting to rmdir non-empty directories The incremental send algorithm assumed that it was possible to issue a directory remove (rmdir) if the inode number it was currently processing was greater than (or equal to) any inode that referenced the directory's inode. This wasn't a valid assumption because any such inode might be a child directory that is pending a move/rename operation, because it was moved into a directory that has a higher inode number and was moved/renamed too - in other words, the case the following commit addressed: 9f03740a956d7ac6a1b8f8c455da6fa5cae11c22 (Btrfs: fix infinite path build loops in incremental send) This made an incremental send issue an rmdir operation before the target directory was actually empty, which made btrfs receive fail. Therefore it needs to wait for all pending child directory inodes to be moved/renamed before sending an rmdir operation. Simple steps to reproduce this issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/c/x $ mkdir /mnt/btrfs/a/b/y $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ btrfs send /mnt/btrfs/snap1 -f /tmp/base.send $ mv /mnt/btrfs/a/b/y /mnt/btrfs/a/b/YY $ mv /mnt/btrfs/a/b/c/x /mnt/btrfs/a/b/YY $ rmdir /mnt/btrfs/a/b/c $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 -f /tmp/incremental.send $ umount /mnt/btrfs $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ btrfs receive /mnt/btrfs -f /tmp/base.send $ btrfs receive /mnt/btrfs -f /tmp/incremental.send The second btrfs receive command failed with: ERROR: rmdir o259-6-0 failed. Directory not empty A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 221 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index cdfd4357a784..46c6b5442f20 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -178,6 +178,47 @@ struct send_ctx { * own move/rename can be performed. */ struct rb_root waiting_dir_moves; + + /* + * A directory that is going to be rm'ed might have a child directory + * which is in the pending directory moves index above.
In this case, + * the directory can only be removed after the move/rename of its child + * is performed. Example: + * + * Parent snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- c/ (ino 259) + * | |-- x/ (ino 260) + * | + * |-- y/ (ino 261) + * + * Send snapshot: + * + * . (ino 256) + * |-- a/ (ino 257) + * |-- b/ (ino 258) + * |-- YY/ (ino 261) + * |-- x/ (ino 260) + * + * Sequence of steps that lead to the send snapshot: + * rm -f /a/b/c/foo.txt + * mv /a/b/y /a/b/YY + * mv /a/b/c/x /a/b/YY + * rmdir /a/b/c + * + * When the child is processed, its move/rename is delayed until its + * parent is processed (as explained above), but all other operations + * like update utimes, chown, chgrp, etc, are performed and the paths + * that it uses for those operations must use the orphanized name of + * its parent (the directory we're going to rm later), so we need to + * memorize that name. + * + * Indexed by the inode number of the directory to be deleted. + */ + struct rb_root orphan_dirs; }; struct pending_dir_move { @@ -192,6 +233,18 @@ struct pending_dir_move { struct waiting_dir_move { struct rb_node node; u64 ino; + /* + * There might be some directory that could not be removed because it + * was waiting for this directory inode to be moved first. Therefore + * after this directory is moved, we can try to rmdir the ino rmdir_ino. + */ + u64 rmdir_ino; +}; + +struct orphan_dir_info { + struct rb_node node; + u64 ino; + u64 gen; }; struct name_cache_entry { @@ -217,6 +270,11 @@ struct name_cache_entry { static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino); + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino); + static int need_send_hole(struct send_ctx *sctx) { return (sctx->parent_root && !sctx->cur_inode_new && @@ -2113,6 +2171,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { fs_path_reset(name); + if (is_waiting_for_rm(sctx, ino)) { + ret = gen_unique_name(sctx, ino, gen, name); + if (ret < 0) + goto out; + ret = fs_path_add_path(dest, name); + break; + } + ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, &parent_inode, &parent_gen, name); if (ret < 0) @@ -2641,12 +2707,78 @@ out: return ret; } +static struct orphan_dir_info * +add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node **p = &sctx->orphan_dirs.rb_node; + struct rb_node *parent = NULL; + struct orphan_dir_info *entry, *odi; + + odi = kmalloc(sizeof(*odi), GFP_NOFS); + if (!odi) + return ERR_PTR(-ENOMEM); + odi->ino = dir_ino; + odi->gen = 0; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct orphan_dir_info, node); + if (dir_ino < entry->ino) { + p = &(*p)->rb_left; + } else if (dir_ino > entry->ino) { + p = &(*p)->rb_right; + } else { + kfree(odi); + return entry; + } + } + + rb_link_node(&odi->node, parent, p); + rb_insert_color(&odi->node, &sctx->orphan_dirs); + return odi; +} + +static struct orphan_dir_info * +get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino) +{ + struct rb_node *n = sctx->orphan_dirs.rb_node; + struct orphan_dir_info *entry; + + while (n) { + entry = rb_entry(n, struct orphan_dir_info, node); + if (dir_ino < entry->ino) + n = n->rb_left; + else if (dir_ino > entry->ino) + n = n->rb_right; + else + return entry; + } + return NULL; +} + +static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino) +{ + struct orphan_dir_info *odi = 
get_orphan_dir_info(sctx, dir_ino); + + return odi != NULL; +} + +static void free_orphan_dir_info(struct send_ctx *sctx, + struct orphan_dir_info *odi) +{ + if (!odi) + return; + rb_erase(&odi->node, &sctx->orphan_dirs); + kfree(odi); +} + /* * Returns 1 if a directory can be removed at this point in time. * We check this by iterating all dir items and checking if the inode behind * the dir item was already processed. */ -static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) +static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, + u64 send_progress) { int ret = 0; struct btrfs_root *root = sctx->parent_root; @@ -2674,6 +2806,8 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) goto out; while (1) { + struct waiting_dir_move *dm; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ret = btrfs_next_leaf(root, path); if (ret < 0) @@ -2692,6 +2826,21 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress) struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc); + dm = get_waiting_dir_move(sctx, loc.objectid); + if (dm) { + struct orphan_dir_info *odi; + + odi = add_orphan_dir_info(sctx, dir); + if (IS_ERR(odi)) { + ret = PTR_ERR(odi); + goto out; + } + odi->gen = dir_gen; + dm->rmdir_ino = dir; + ret = 0; + goto out; + } + if (loc.objectid > send_progress) { ret = 0; goto out; @@ -2709,19 +2858,9 @@ out: static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) { - struct rb_node *n = sctx->waiting_dir_moves.rb_node; - struct waiting_dir_move *entry; + struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino); - while (n) { - entry = rb_entry(n, struct waiting_dir_move, node); - if (ino < entry->ino) - n = n->rb_left; - else if (ino > entry->ino) - n = n->rb_right; - else - return 1; - } - return 0; + return entry != NULL; } static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) @@ -2734,6 +2873,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) if (!dm) return -ENOMEM; dm->ino = ino; + dm->rmdir_ino = 0; while (*p) { parent = *p; @@ -2753,24 +2893,31 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return 0; } -static int del_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static struct waiting_dir_move * +get_waiting_dir_move(struct send_ctx *sctx, u64 ino) { struct rb_node *n = sctx->waiting_dir_moves.rb_node; struct waiting_dir_move *entry; while (n) { entry = rb_entry(n, struct waiting_dir_move, node); - if (ino < entry->ino) { + if (ino < entry->ino) n = n->rb_left; - } else if (ino > entry->ino) { + else if (ino > entry->ino) n = n->rb_right; - } else { - rb_erase(&entry->node, &sctx->waiting_dir_moves); - kfree(entry); - return 0; - } + else + return entry; } - return -ENOENT; + return NULL; +} + +static void free_waiting_dir_move(struct send_ctx *sctx, + struct waiting_dir_move *dm) +{ + if (!dm) + return; + rb_erase(&dm->node, &sctx->waiting_dir_moves); + kfree(dm); } static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) @@ -2861,6 +3008,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) u64 orig_progress = sctx->send_progress; struct recorded_ref *cur; u64 parent_ino, parent_gen; + struct waiting_dir_move *dm = NULL; + u64 rmdir_ino = 0; int ret; name = fs_path_alloc(); @@ -2870,8 +3019,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; } - ret = del_waiting_dir_move(sctx, pm->ino); - ASSERT(ret == 0); + dm = 
get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + rmdir_ino = dm->rmdir_ino; + free_waiting_dir_move(sctx, dm); ret = get_first_ref(sctx->parent_root, pm->ino, &parent_ino, &parent_gen, name); @@ -2914,6 +3065,35 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) if (ret < 0) goto out; + if (rmdir_ino) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, rmdir_ino); + if (!odi) { + /* already deleted */ + goto finish; + } + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + if (ret < 0) + goto out; + if (!ret) + goto finish; + + name = fs_path_alloc(); + if (!name) { + ret = -ENOMEM; + goto out; + } + ret = get_cur_path(sctx, rmdir_ino, odi->gen, name); + if (ret < 0) + goto out; + ret = send_rmdir(sctx, name); + if (ret < 0) + goto out; + free_orphan_dir_info(sctx, odi); + } + +finish: ret = send_utimes(sctx, pm->ino, pm->gen); if (ret < 0) goto out; @@ -2923,6 +3103,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) * and old parent(s). */ list_for_each_entry(cur, &pm->update_refs, list) { + if (cur->dir == rmdir_ino) + continue; ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; @@ -3259,7 +3441,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * later, we do this check again and rmdir it then if possible. * See the use of check_dirs for more details. */ - ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino); + ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -3352,7 +3535,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; } else if (ret == inode_state_did_delete && cur->dir != last_dir_ino_rm) { - ret = can_rmdir(sctx, cur->dir, sctx->cur_ino); + ret = can_rmdir(sctx, cur->dir, cur->dir_gen, + sctx->cur_ino); if (ret < 0) goto out; if (ret) { @@ -5389,6 +5573,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; + sctx->orphan_dirs = RB_ROOT; sctx->clone_roots = vzalloc(sizeof(struct clone_root) * (arg->clone_sources_count + 1)); @@ -5526,6 +5711,16 @@ out: kfree(dm); } + WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs)); + while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) { + struct rb_node *n; + struct orphan_dir_info *odi; + + n = rb_first(&sctx->orphan_dirs); + odi = rb_entry(n, struct orphan_dir_info, node); + free_orphan_dir_info(sctx, odi); + } + if (sort_clone_roots) { for (i = 0; i < sctx->clone_roots_cnt; i++) btrfs_root_dec_send_in_progress( -- cgit v1.2.3 From 6baa4293af8abe95018e911c3df60ed5bfacc76f Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 20 Feb 2014 21:15:25 +0000 Subject: Btrfs: correctly determine if blocks are shared in btrfs_compare_trees Just comparing the pointers (logical disk addresses) of the btree nodes is not completely bullet proof, we have to check if their generation numbers match too. It is guaranteed that a COW operation will result in a block with a different logical disk address than the original block's address, but over time we can reuse that former logical disk address. For example, creating a 2Gb filesystem on a loop device, and having a script running in a loop always updating the access timestamp of a file, resulted in the same logical disk address being reused for the same fs btree block in about only 4 minutes. 
This could make us skip entire subtrees when doing an incremental send (which is currently the only user of btrfs_compare_trees). However the odds of getting 2 blocks at the same tree level, with the same logical disk address, equal first slot keys and different generations, should hopefully be very low. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cbd3a7d6fa68..88d1b1eedc9c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root, int advance_right; u64 left_blockptr; u64 right_blockptr; + u64 left_gen; + u64 right_gen; u64 left_start_ctransid; u64 right_start_ctransid; u64 ctransid; @@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_blockptr = btrfs_node_blockptr( right_path->nodes[right_level], right_path->slots[right_level]); - if (left_blockptr == right_blockptr) { + left_gen = btrfs_node_ptr_generation( + left_path->nodes[left_level], + left_path->slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path->nodes[right_level], + right_path->slots[right_level]); + if (left_blockptr == right_blockptr && + left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. -- cgit v1.2.3 From bf0d1f441d1679136c25e6141dd7e66cc7a14218 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 21 Feb 2014 00:01:32 +0000 Subject: Btrfs: fix send issuing outdated paths for utimes, chown and chmod When doing an incremental send, if we had a directory pending a move/rename operation and none of its parents, except for the immediate parent, were pending a move/rename, after processing the directory's references, we would be issuing utimes, chown and chmod intructions against am outdated path - a path which matched the one in the parent root. This change also simplifies a bit the code that deals with building a path for a directory which has a move/rename operation delayed. Steps to reproduce: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/c/d/e $ mkdir /mnt/btrfs/a/b/c/f $ chmod 0777 /mnt/btrfs/a/b/c/d/e $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ btrfs send /mnt/btrfs/snap1 -f /tmp/base.send $ mv /mnt/btrfs/a/b/c/f /mnt/btrfs/a/b/f2 $ mv /mnt/btrfs/a/b/c/d/e /mnt/btrfs/a/b/f2/e2 $ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2 $ mv /mnt/btrfs/a/b/c2/d /mnt/btrfs/a/b/c2/d2 $ chmod 0700 /mnt/btrfs/a/b/f2/e2 $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 -f /tmp/incremental.send $ umount /mnt/btrfs $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ btrfs receive /mnt/btrfs -f /tmp/base.send $ btrfs receive /mnt/btrfs -f /tmp/incremental.send The second btrfs receive command failed with: ERROR: chmod a/b/c/d/e failed. No such file or directory A test case for xfstests follows. 
Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 46c6b5442f20..298e25de13ab 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2000,7 +2000,6 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, - int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -2010,8 +2009,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; - if (skip_name_cache) - goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes @@ -2056,12 +2053,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } -get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ - if (ino < sctx->send_progress && !skip_name_cache) + if (ino < sctx->send_progress) ret = get_first_ref(sctx->send_root, ino, parent_ino, parent_gen, dest); else @@ -2085,8 +2081,6 @@ get_ref: goto out; ret = 1; } - if (skip_name_cache) - goto out; out_cache: /* @@ -2154,7 +2148,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; - int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2162,9 +2155,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } - if (is_waiting_for_move(sctx, ino)) - skip_name_cache = 1; - dest->reversed = 1; fs_path_reset(dest); @@ -2179,16 +2169,19 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, - &parent_inode, &parent_gen, name); + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret) + stop = 1; + } + if (ret < 0) goto out; - if (ret) - stop = 1; - - if (!skip_name_cache && - is_waiting_for_move(sctx, parent_inode)) - skip_name_cache = 1; ret = fs_path_add_path(dest, name); if (ret < 0) -- cgit v1.2.3 From 886322e8e787604731e782d36c34327a8970bda9 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Mon, 17 Feb 2014 09:13:57 +0530 Subject: btrfs: Use PTR_ERR_OR_ZERO PTR_RET is deprecated. Use PTR_ERR_OR_ZERO instead. While at it also include missing err.h header. Signed-off-by: Sachin Kamat Signed-off-by: Josef Bacik --- fs/btrfs/root-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 1389b69059de..38bb47e7d6b1 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. 
*/ +#include #include #include "ctree.h" #include "transaction.h" @@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.offset++; root = btrfs_read_fs_root(tree_root, &root_key); - err = PTR_RET(root); + err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { break; } else if (err == -ENOENT) { -- cgit v1.2.3 From 6cf7f77e6ba55cc1469aaf795507d274402892e9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 19 Feb 2014 19:24:16 +0800 Subject: Btrfs: fix a possible deadlock between scrub and transaction committing btrfs_scrub_continue() will be called when cleaning up transaction.However, this can only be called if btrfs_scrub_pause() is called before. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 34cd83184c4a..84da6669f384 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1578,8 +1578,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, trace_btrfs_transaction_commit(root); - btrfs_scrub_continue(root); - if (current->journal_info == trans) current->journal_info = NULL; @@ -1754,7 +1752,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, /* ->aborted might be set after the previous check, so check it */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - goto cleanup_transaction; + goto scrub_continue; } /* * the reloc mutex makes sure that we stop @@ -1771,7 +1769,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = create_pending_snapshots(trans, root->fs_info); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1787,13 +1785,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_items(trans, root); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1823,7 +1821,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1844,7 +1842,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1855,7 +1853,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = cur_trans->aborted; mutex_unlock(&root->fs_info->tree_log_mutex); mutex_unlock(&root->fs_info->reloc_mutex); - goto cleanup_transaction; + goto scrub_continue; } btrfs_prepare_extent_commit(trans, root); @@ -1891,13 +1889,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } ret = write_ctree_super(trans, root, 0); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); - goto cleanup_transaction; + goto scrub_continue; } /* @@ -1940,6 +1938,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; +scrub_continue: + btrfs_scrub_continue(root); 
cleanup_transaction: btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; -- cgit v1.2.3 From 12cf93728dfba237b46001a95479829c7179cdc9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 19 Feb 2014 19:24:17 +0800 Subject: Btrfs: device_replace: fix deadlock for nocow case Commit cb7ab02156e4 caused the following deadlock, found by xfstests btrfs/011: Thread1 is committing a transaction and is blocked at btrfs_scrub_pause(). Thread2 is calling btrfs_file_aio_write(), which holds the inode's @i_mutex and tries to commit a transaction (blocked because Thread1 is committing the transaction). Thread3 is the copy_nocow_page worker, which will also try to take the inode's @i_mutex, so Thread3 will wait until Thread2 finishes. Thread4 is waiting for the pending workers to finish, which means it waits until Thread3 finishes. So the problem is like this: Thread1--->Thread4--->Thread3--->Thread2---->Thread1 Deadlock happens! We fix it by letting Thread1 go first, which means we won't block the transaction commit while we are waiting for the pending workers to finish. Reported-by: Qu Wenruo Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 51c342b9f5ef..f2f8803ea79c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2686,10 +2686,23 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + atomic_inc(&fs_info->scrubs_paused); + wake_up(&fs_info->scrub_pause_wait); + + /* + * must be called before we decrease @scrub_paused. + * make sure we don't block transaction commit while + * we are waiting pending workers finished. + */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - scrub_blocked_if_needed(fs_info); + atomic_set(&sctx->wr_ctx.flush_all_writes, 0); + + mutex_lock(&fs_info->scrub_lock); + __scrub_blocked_if_needed(fs_info); + atomic_dec(&fs_info->scrubs_paused); + mutex_unlock(&fs_info->scrub_lock); + wake_up(&fs_info->scrub_pause_wait); btrfs_put_block_group(cache); if (ret) -- cgit v1.2.3 From c0af8f0b1cf7ec5cde4450be9f8bfeb8c211d40a Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 19 Feb 2014 19:24:18 +0800 Subject: Btrfs: cancel scrub on transaction abortion If we fail to commit the transaction, we'd better cancel the scrub operations. Suggested-by: Miao Xie Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 84da6669f384..79a4186b724a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1580,6 +1580,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, if (current->journal_info == trans) current->journal_info = NULL; + btrfs_scrub_cancel(root->fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } -- cgit v1.2.3 From 32a447896c7b4c5cf5502a3ca7f5ef57d498fb03 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 19 Feb 2014 19:24:19 +0800 Subject: Btrfs: wake up @scrub_pause_wait as much as we can Checking the @scrubs_running=@scrubs_paused condition inside wait_event() is not an atomic operation, which means we may inc/dec @scrub_running/paused at any time. Let's wake up @scrub_pause_wait as much as we can to let the transaction commit be blocked less.
An example below: Thread1 Thread2 |->scrub_blocked_if_needed() |->scrub_pending_trans_workers_inc |->increase @scrub_paused |->increase @scrub_running |->wake up scrub_pause_wait list |->scrub blocked |->increase @scrub_paused Thread3 is commiting transaction which is blocked at btrfs_scrub_pause(). So after Thread2 increase @scrub_paused, we meet the condition @scrub_paused=@scrub_running, but transaction will be still blocked until another calling to wake up @scrub_pause_wait. Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/scrub.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f2f8803ea79c..682ec3fca4a1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -315,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) atomic_inc(&fs_info->scrubs_running); atomic_inc(&fs_info->scrubs_paused); mutex_unlock(&fs_info->scrub_lock); + + /* + * check if @scrubs_running=@scrubs_paused condition + * inside wait_event() is not an atomic operation. + * which means we may inc/dec @scrub_running/paused + * at any time. Let's wake up @scrub_pause_wait as + * much as we can to let commit transaction blocked less. + */ + wake_up(&fs_info->scrub_pause_wait); + atomic_inc(&sctx->workers_pending); } -- cgit v1.2.3 From e84752d434b5cca0869e906e7b94d0531b25c6d3 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 13 Feb 2014 11:19:47 +0800 Subject: Btrfs: skip locking when searching commit root We won't change commit root, skip locking dance with commit root when walking backrefs, this can speed up btrfs send operations. Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik --- fs/btrfs/backref.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a88da721dfc5..860f4f22b9b0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -873,8 +873,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) + if (!trans) { path->search_commit_root = 1; + path->skip_locking = 1; + } /* * grab both a lock on the path and a lock on the delayed ref head. -- cgit v1.2.3 From cbc0e9287d710ce7dce5f8daf667729e83316c45 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 25 Feb 2014 14:15:12 +0000 Subject: Btrfs: remove unneeded field / smaller extent_map structure We don't need to have an unsigned int field in the extent_map struct to tell us whether the extent map is in the inode's extent_map tree or not. We can use the rb_node struct field and the RB_CLEAR_NODE and RB_EMPTY_NODE macros to achieve the same task. This reduces sizeof(struct extent_map) from 152 bytes to 144 bytes (on a 64 bits system). 
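The replacement follows the usual rbtree idiom, sketched below (it assumes the node is cleared on allocation and right after every rb_erase(), which is exactly what the hunks below do):

    RB_CLEAR_NODE(&em->rb_node);        /* em is not in any tree */
    ...
    if (!RB_EMPTY_NODE(&em->rb_node))   /* equivalent to the old em->in_tree check */
        ...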
Signed-off-by: Filipe David Borba Manana Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_map.c | 17 ++++++----------- fs/btrfs/extent_map.h | 6 +++++- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fbe501d3bd01..2c9fbe3b6eec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2763,7 +2763,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, if (em_cached && *em_cached) { em = *em_cached; - if (em->in_tree && start >= em->start && + if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { atomic_inc(&em->refs); return em; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 996ad56b57db..64d08f94485d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void) em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); if (!em) return NULL; - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; @@ -73,7 +73,7 @@ void free_extent_map(struct extent_map *em) return; WARN_ON(atomic_read(&em->refs) == 0); if (atomic_dec_and_test(&em->refs)) { - WARN_ON(em->in_tree); + WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -99,8 +99,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) parent = *p; entry = rb_entry(parent, struct extent_map, rb_node); - WARN_ON(!entry->in_tree); - if (em->start < entry->start) p = &(*p)->rb_left; else if (em->start >= extent_map_end(entry)) @@ -128,7 +126,6 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (end > entry->start && em->start < extent_map_end(entry)) return -EEXIST; - em->in_tree = 1; rb_link_node(&em->rb_node, orig_parent, p); rb_insert_color(&em->rb_node, root); return 0; @@ -153,8 +150,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, prev = n; prev_entry = entry; - WARN_ON(!entry->in_tree); - if (offset < entry->start) n = n->rb_left; else if (offset >= extent_map_end(entry)) @@ -240,12 +235,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; em->block_start = merge->block_start; - merge->in_tree = 0; em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); rb_erase(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); free_extent_map(merge); } } @@ -257,7 +252,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->len += merge->len; em->block_len += merge->block_len; rb_erase(&merge->rb_node, &tree->map); - merge->in_tree = 0; + RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); free_extent_map(merge); @@ -319,7 +314,7 @@ out: void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { clear_bit(EXTENT_FLAG_LOGGING, &em->flags); - if (em->in_tree) + if (extent_map_in_tree(em)) try_merge_map(tree, em); } @@ -434,6 +429,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) rb_erase(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) list_del_init(&em->list); - em->in_tree = 0; + RB_CLEAR_NODE(&em->rb_node); 
return ret; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 93fba716d7f8..f0a645a14d6e 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -33,7 +33,6 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - unsigned int in_tree; unsigned int compress_type; struct list_head list; }; @@ -44,6 +43,11 @@ struct extent_map_tree { rwlock_t lock; }; +static inline int extent_map_in_tree(const struct extent_map *em) +{ + return !RB_EMPTY_NODE(&em->rb_node); +} + static inline u64 extent_map_end(struct extent_map *em) { if (em->start + em->len < em->start) -- cgit v1.2.3 From f2071b21553bf8f1eae583e32b9068393f61cbe9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Feb 2014 15:05:53 +0000 Subject: Btrfs: more efficient split extent state insertion When we split an extent state there's no need to start the rbtree search from the root node - we can start it from the original extent state node, since we would end up in its subtree if we do the search starting at the root node anyway. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2c9fbe3b6eec..fd78c84821c8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -229,12 +229,14 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, u64 offset, +static struct rb_node *tree_insert(struct rb_root *root, + struct rb_node *search_start, + u64 offset, struct rb_node *node, struct rb_node ***p_in, struct rb_node **parent_in) { - struct rb_node **p = &root->rb_node; + struct rb_node **p; struct rb_node *parent = NULL; struct tree_entry *entry; @@ -244,6 +246,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset, goto do_insert; } + p = search_start ? &search_start : &root->rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct tree_entry, rb_node); @@ -430,7 +433,7 @@ static int insert_state(struct extent_io_tree *tree, set_state_bits(tree, state, bits); - node = tree_insert(&tree->state, end, &state->rb_node, p, parent); + node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { struct extent_state *found; found = rb_entry(node, struct extent_state, rb_node); @@ -477,8 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node, - NULL, NULL); + node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, + &prealloc->rb_node, NULL, NULL); if (node) { free_extent_state(prealloc); return -EEXIST; -- cgit v1.2.3 From 176840b3aa3cb795ddec4fc665ffbd707abff906 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 25 Feb 2014 14:15:13 +0000 Subject: Btrfs: more efficient btrfs_drop_extent_cache While droping extent map structures from the extent cache that cover our target range, we would remove each extent map structure from the red black tree and then add either 1 or 2 new extent map structures if the former extent map covered sections outside our target range. This change simply attempts to replace the existing extent map structure with a new one that covers the subsection we're not interested in, instead of doing a red black remove operation followed by an insertion operation. 
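The in-place replacement is possible because the new extent map covers a subrange of the one it replaces, so the tree ordering is preserved and the rbtree node can simply be swapped without a rebalance. A sketch of the core of the new helper (see the full hunks below):

    rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
    RB_CLEAR_NODE(&cur->rb_node);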
The number of elements in an inode's extent map tree can get very high for large files under random writes. For example, while running the following test: sysbench --test=fileio --file-num=1 --file-total-size=10G \ --file-test-mode=rndrw --num-threads=32 --file-block-size=32768 \ --max-requests=500000 --file-rw-ratio=2 [prepare|run] I captured the following histogram capturing the number of extent_map items in the red black tree while that test was running: Count: 122462 Range: 1.000 - 172231.000; Mean: 96415.831; Median: 101855.000; Stddev: 49700.981 Percentiles: 90th: 160120.000; 95th: 166335.000; 99th: 171070.000 1.000 - 5.231: 452 | 5.231 - 187.392: 87 | 187.392 - 585.911: 206 | 585.911 - 1827.438: 623 | 1827.438 - 5695.245: 1962 # 5695.245 - 17744.861: 6204 #### 17744.861 - 55283.764: 21115 ############ 55283.764 - 172231.000: 91813 ##################################################### Benchmark: sysbench --test=fileio --file-num=1 --file-total-size=10G --file-test-mode=rndwr \ --num-threads=64 --file-block-size=32768 --max-requests=0 --max-time=60 \ --file-io-mode=sync --file-fsync-freq=0 [prepare|run] Before this change: 122.1Mb/sec After this change: 125.07Mb/sec (averages of 5 test runs) Test machine: quad core intel i5-3570K, 32Gb of ram, SSD Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 39 ++++++++++++++++++++++++++++++--------- fs/btrfs/extent_map.h | 4 ++++ fs/btrfs/file.c | 16 +++++++++++----- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 64d08f94485d..1874aee69c86 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -318,6 +318,20 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) try_merge_map(tree, em); } +static inline void setup_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, + int modified) +{ + atomic_inc(&em->refs); + em->mod_start = em->start; + em->mod_len = em->len; + + if (modified) + list_move(&em->list, &tree->modified_extents); + else + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in @@ -337,15 +351,7 @@ int add_extent_mapping(struct extent_map_tree *tree, if (ret) goto out; - atomic_inc(&em->refs); - - em->mod_start = em->start; - em->mod_len = em->len; - - if (modified) - list_move(&em->list, &tree->modified_extents); - else - try_merge_map(tree, em); + setup_extent_mapping(tree, em, modified); out: return ret; } @@ -432,3 +438,18 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&em->rb_node); return ret; } + +void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified) +{ + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + ASSERT(extent_map_in_tree(cur)); + if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + list_del_init(&cur->list); + rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map); + RB_CLEAR_NODE(&cur->rb_node); + + setup_extent_mapping(tree, new, modified); +} diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index f0a645a14d6e..e7fd8a56a140 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -68,6 +68,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 
+void replace_extent_mapping(struct extent_map_tree *tree, + struct extent_map *cur, + struct extent_map *new, + int modified); struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 762ca32bd988..31e48b947060 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -591,7 +591,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, clear_bit(EXTENT_FLAG_PINNED, &em->flags); clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); - remove_extent_mapping(em_tree, em); if (no_splits) goto next; @@ -622,8 +621,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; split->compress_type = em->compress_type; - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; split2 = NULL; @@ -661,12 +659,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->orig_block_len = 0; } - ret = add_extent_mapping(em_tree, split, modified); - BUG_ON(ret); /* Logic error */ + if (extent_map_in_tree(em)) { + replace_extent_mapping(em_tree, em, split, + modified); + } else { + ret = add_extent_mapping(em_tree, split, + modified); + ASSERT(ret == 0); /* Logic error */ + } free_extent_map(split); split = NULL; } next: + if (extent_map_in_tree(em)) + remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); /* once for us */ -- cgit v1.2.3 From 1b2782c8ed24db03ad49942fa37c9f196b7c4af3 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 25 Feb 2014 19:32:59 +0100 Subject: btrfs: send: fix old buffer length in fs_path_ensure_buf In "btrfs: send: lower memory requirements in common case" the code to save the old_buf_len was incorrectly moved to a wrong place and broke the original logic. 
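To make the breakage concrete (a sketch with illustrative numbers, not from the original report): for a reversed path the existing bytes sit at the end of the buffer, so after growing it they have to be moved from the old end to the new end. If old_buf_len is only captured after the allocation has updated p->buf_len, the source offset for that move is computed from the new size and the wrong bytes get relocated. E.g. growing a 32 byte buffer that holds a 10 byte reversed path to 64 bytes:

    tmp_buf  = p->buf + 32 - 10 - 1;   /* must use the old length (32), not 64 */
    p->end   = p->buf + 64 - 1;
    p->start = p->end - 10;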
Reported-by: Filipe David Manana Signed-off-by: David Sterba Reviewed-by: Filipe David Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 298e25de13ab..246df8513c8a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -346,6 +346,9 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; + path_len = p->end - p->start; + old_buf_len = p->buf_len; + /* * First time the inline_buf does not suffice */ @@ -368,9 +371,6 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) p->buf_len = ksize(p->buf); } - path_len = p->end - p->start; - old_buf_len = p->buf_len; - if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; p->end = p->buf + p->buf_len - 1; -- cgit v1.2.3 From 9c9ca00bd31989f1a3dcbf54e97c979024e44409 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 25 Feb 2014 19:33:08 +0100 Subject: btrfs: send: simplify allocation code in fs_path_ensure_buf Signed-off-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 246df8513c8a..ba23fef3c5e5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -352,24 +352,18 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) /* * First time the inline_buf does not suffice */ - if (p->buf == p->inline_buf) { - p->buf = kmalloc(len, GFP_NOFS); - if (!p->buf) - return -ENOMEM; - /* - * The real size of the buffer is bigger, this will let the - * fast path happen most of the time - */ - p->buf_len = ksize(p->buf); - } else { - char *tmp; - - tmp = krealloc(p->buf, len, GFP_NOFS); - if (!tmp) - return -ENOMEM; - p->buf = tmp; - p->buf_len = ksize(p->buf); - } + if (p->buf == p->inline_buf) + tmp_buf = kmalloc(len, GFP_NOFS); + else + tmp_buf = krealloc(p->buf, len, GFP_NOFS); + if (!tmp_buf) + return -ENOMEM; + p->buf = tmp_buf; + /* + * The real size of the buffer is bigger, this will let the fast path + * happen most of the time + */ + p->buf_len = ksize(p->buf); if (p->reversed) { tmp_buf = p->buf + old_buf_len - path_len - 1; -- cgit v1.2.3 From c933956ddf80bc455d33cbcf39d35d935daf45a9 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 27 Feb 2014 13:58:04 +0800 Subject: Btrfs: fix wrong lock range and write size in check_can_nocow() The write range may not be sector-aligned, for example: |--------|--------| <- write range, sector-unaligned, size: 2blocks |--------|--------|--------| <- correct lock range, size: 3blocks But according to the old code, we used the size of write range to calculate the lock range directly, not considered the offset, we would get a wrong lock range: |--------|--------| <- write range, sector-unaligned, size: 2blocks |--------|--------| <- wrong lock range, size: 2blocks And besides that, the old code also had the same problem when calculating the real write size. Correct them. 
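A worked example with illustrative numbers (4096 byte sectorsize, pos = 6144, *write_bytes = 8192, so the write touches bytes 6144..14335, i.e. three sectors):

    lockstart = round_down(6144, 4096)                   = 4096
    old: lockend = lockstart + round_up(8192, 4096) - 1  = 12287  (2 blocks, too short)
    new: lockend = round_up(6144 + 8192, 4096) - 1       = 16383  (3 blocks, correct)

The same offset has to be taken into account for the nocow'able write size, hence the min(*write_bytes, num_bytes - pos + lockstart) in the hunk below.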
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 31e48b947060..fc2d21b0a022 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1411,7 +1411,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, int ret; lockstart = round_down(pos, root->sectorsize); - lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1; + lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; while (1) { lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1434,7 +1434,8 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL, GFP_NOFS); - *write_bytes = min_t(size_t, *write_bytes, num_bytes); + *write_bytes = min_t(size_t, *write_bytes , + num_bytes - pos + lockstart); } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); -- cgit v1.2.3 From 7b2b70851f862b68714f357d2926adbb6c574fdd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 27 Feb 2014 13:58:05 +0800 Subject: Btrfs: fix preallocate vs double nocow write We can not release the reserved metadata space for the first write if we find the write position is pre-allocated. Because the kernel might write the data on the disk before we do the second write but after the can-nocow check, if we release the space for the first write, we might fail to update the metadata because of no space. Fix this problem by end nocow write if there is dirty data in the range whose space is pre-allocated. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 9 ++------- fs/btrfs/inode.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fc2d21b0a022..d88f2dc4d045 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1427,16 +1427,11 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); - if (ret <= 0) { + if (ret <= 0) ret = 0; - } else { - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, - NULL, GFP_NOFS); + else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8dba152883d3..0182f081d499 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6557,6 +6557,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 disk_bytenr; @@ -6633,6 +6634,20 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (btrfs_extent_readonly(root, disk_bytenr)) goto out; + + num_bytes = min(offset + *len, extent_end) - offset; + if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 range_end; + + range_end = round_up(offset + num_bytes, root->sectorsize) - 1; + ret = test_range_bit(io_tree, offset, range_end, + EXTENT_DELALLOC, 0, NULL); + if (ret) { + ret = -EAGAIN; + goto out; + } + } + btrfs_release_path(path); /* @@ -6661,7 +6676,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, 
*/ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - num_bytes = min(offset + *len, extent_end) - offset; if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out; /* -- cgit v1.2.3 From 644d1940ab0f20d1ba13295827a86a8a0c8583f3 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Feb 2014 17:29:01 +0800 Subject: Btrfs: skip search tree for REG files It is really unnecessary to search tree again for @gen, @mode and @rdev in the case of REG inodes' creation, as we've got btrfs_inode_item in sctx, and @gen, @mode and @rdev can easily be fetched. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ba23fef3c5e5..c2522e4e2c59 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -112,6 +112,7 @@ struct send_ctx { int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; + u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 send_progress; @@ -2439,10 +2440,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL, - NULL, &rdev); - if (ret < 0) - goto out; + if (ino != sctx->cur_ino) { + ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, + NULL, NULL, &rdev); + if (ret < 0) + goto out; + } else { + gen = sctx->cur_inode_gen; + mode = sctx->cur_inode_mode; + rdev = sctx->cur_inode_rdev; + } if (S_ISREG(mode)) { cmd = BTRFS_SEND_C_MKFILE; @@ -5027,6 +5034,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { @@ -5071,6 +5080,8 @@ static int changed_inode(struct send_ctx *sctx, sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( sctx->left_path->nodes[0], left_ii); + sctx->cur_inode_rdev = btrfs_inode_rdev( + sctx->left_path->nodes[0], left_ii); ret = send_create_inode_if_needed(sctx); if (ret < 0) goto out; -- cgit v1.2.3 From f5961d41d7575faa6e2905daa08650aa388ba9d0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:02 +0800 Subject: btrfs: Cleanup the unused struct async_sched. The struct async_sched is not used by any codes and can be removed. Signed-off-by: Qu Wenruo Reviewed-by: Josef Bacik Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 07629e99809a..82a63b1a36ef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5323,13 +5323,6 @@ static void btrfs_end_bio(struct bio *bio, int err) } } -struct async_sched { - struct bio *bio; - int rw; - struct btrfs_fs_info *info; - struct btrfs_work work; -}; - /* * see run_scheduled_bios for a description of why bios are collected for * async submit. -- cgit v1.2.3 From 08a9ff3264181986d1d692a4e6fce3669700c9f8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:03 +0800 Subject: btrfs: Added btrfs_workqueue_struct implemented ordered execution based on kernel workqueue Use kernel workqueue to implement a new btrfs_workqueue_struct, which has the ordering execution feature like the btrfs_worker. 
The func is executed in a concurrency way, and the ordred_func/ordered_free is executed in the sequence them are queued after the corresponding func is done. The new btrfs_workqueue works much like the original one, one workqueue for normal work and a list for ordered work. When a work is queued, ordered work will be added to the list and helper function will be queued into the workqueue. The helper function will execute a normal work and then check and execute as many ordered work as possible in the sequence they were queued. At this patch, high priority work queue or thresholding is not added yet. The high priority feature and thresholding will be added in the following patches. Signed-off-by: Qu Wenruo Signed-off-by: Lai Jiangshan Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 137 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/async-thread.h | 27 ++++++++++ 2 files changed, 164 insertions(+) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 0b78bf28ff5d..905de02e4386 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -21,6 +22,7 @@ #include #include #include +#include #include "async-thread.h" #define WORK_QUEUED_BIT 0 @@ -727,3 +729,138 @@ void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) wake_up_process(worker->task); spin_unlock_irqrestore(&worker->lock, flags); } + +struct btrfs_workqueue_struct { + struct workqueue_struct *normal_wq; + /* List head pointing to ordered work list */ + struct list_head ordered_list; + + /* Spinlock for ordered_list */ + spinlock_t list_lock; +}; + +struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, + int flags, + int max_active) +{ + struct btrfs_workqueue_struct *ret = kzalloc(sizeof(*ret), GFP_NOFS); + + if (unlikely(!ret)) + return NULL; + + ret->normal_wq = alloc_workqueue("%s-%s", flags, max_active, + "btrfs", name); + if (unlikely(!ret->normal_wq)) { + kfree(ret); + return NULL; + } + + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + return ret; +} + +static void run_ordered_work(struct btrfs_workqueue_struct *wq) +{ + struct list_head *list = &wq->ordered_list; + struct btrfs_work_struct *work; + spinlock_t *lock = &wq->list_lock; + unsigned long flags; + + while (1) { + spin_lock_irqsave(lock, flags); + if (list_empty(list)) + break; + work = list_entry(list->next, struct btrfs_work_struct, + ordered_list); + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* + * we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + spin_unlock_irqrestore(lock, flags); + work->ordered_func(work); + + /* now take the lock again and drop our item from the list */ + spin_lock_irqsave(lock, flags); + list_del(&work->ordered_list); + spin_unlock_irqrestore(lock, flags); + + /* + * we don't want to call the ordered free functions + * with the lock held though + */ + work->ordered_free(work); + } + spin_unlock_irqrestore(lock, flags); +} + +static void normal_work_helper(struct work_struct *arg) +{ + struct btrfs_work_struct *work; + struct 
btrfs_workqueue_struct *wq; + int need_order = 0; + + work = container_of(arg, struct btrfs_work_struct, normal_work); + /* + * We should not touch things inside work in the following cases: + * 1) after work->func() if it has no ordered_free + * Since the struct is freed in work->func(). + * 2) after setting WORK_DONE_BIT + * The work may be freed in other threads almost instantly. + * So we save the needed things here. + */ + if (work->ordered_func) + need_order = 1; + wq = work->wq; + + work->func(work); + if (need_order) { + set_bit(WORK_DONE_BIT, &work->flags); + run_ordered_work(wq); + } +} + +void btrfs_init_work(struct btrfs_work_struct *work, + void (*func)(struct btrfs_work_struct *), + void (*ordered_func)(struct btrfs_work_struct *), + void (*ordered_free)(struct btrfs_work_struct *)) +{ + work->func = func; + work->ordered_func = ordered_func; + work->ordered_free = ordered_free; + INIT_WORK(&work->normal_work, normal_work_helper); + INIT_LIST_HEAD(&work->ordered_list); + work->flags = 0; +} + +void btrfs_queue_work(struct btrfs_workqueue_struct *wq, + struct btrfs_work_struct *work) +{ + unsigned long flags; + + work->wq = wq; + if (work->ordered_func) { + spin_lock_irqsave(&wq->list_lock, flags); + list_add_tail(&work->ordered_list, &wq->ordered_list); + spin_unlock_irqrestore(&wq->list_lock, flags); + } + queue_work(wq->normal_wq, &work->normal_work); +} + +void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq) +{ + destroy_workqueue(wq->normal_wq); + kfree(wq); +} + +void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max) +{ + workqueue_set_max_active(wq->normal_wq, max); +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f26792683ed..9d8da53f6dd9 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2014 Fujitsu. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -118,4 +119,30 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, struct btrfs_workers *async_starter); void btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); + +struct btrfs_workqueue_struct; + +struct btrfs_work_struct { + void (*func)(struct btrfs_work_struct *arg); + void (*ordered_func)(struct btrfs_work_struct *arg); + void (*ordered_free)(struct btrfs_work_struct *arg); + + /* Don't touch things below */ + struct work_struct normal_work; + struct list_head ordered_list; + struct btrfs_workqueue_struct *wq; + unsigned long flags; +}; + +struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, + int flags, + int max_active); +void btrfs_init_work(struct btrfs_work_struct *work, + void (*func)(struct btrfs_work_struct *), + void (*ordered_func)(struct btrfs_work_struct *), + void (*ordered_free)(struct btrfs_work_struct *)); +void btrfs_queue_work(struct btrfs_workqueue_struct *wq, + struct btrfs_work_struct *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max); #endif -- cgit v1.2.3 From 1ca08976ae94f3594dd7303584581cf8099ce47e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:04 +0800 Subject: btrfs: Add high priority workqueue support for btrfs_workqueue_struct Add high priority function to btrfs_workqueue. 
This is implemented by embedding a btrfs_workqueue into a btrfs_workqueue and use some helper functions to differ the normal priority wq and high priority wq. So the high priority wq is completely independent from the normal workqueue. Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 91 ++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/async-thread.h | 5 ++- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 905de02e4386..193c84964db9 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -730,7 +730,7 @@ void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_unlock_irqrestore(&worker->lock, flags); } -struct btrfs_workqueue_struct { +struct __btrfs_workqueue_struct { struct workqueue_struct *normal_wq; /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -739,6 +739,38 @@ struct btrfs_workqueue_struct { spinlock_t list_lock; }; +struct btrfs_workqueue_struct { + struct __btrfs_workqueue_struct *normal; + struct __btrfs_workqueue_struct *high; +}; + +static inline struct __btrfs_workqueue_struct +*__btrfs_alloc_workqueue(char *name, int flags, int max_active) +{ + struct __btrfs_workqueue_struct *ret = kzalloc(sizeof(*ret), GFP_NOFS); + + if (unlikely(!ret)) + return NULL; + + if (flags & WQ_HIGHPRI) + ret->normal_wq = alloc_workqueue("%s-%s-high", flags, + max_active, "btrfs", name); + else + ret->normal_wq = alloc_workqueue("%s-%s", flags, + max_active, "btrfs", name); + if (unlikely(!ret->normal_wq)) { + kfree(ret); + return NULL; + } + + INIT_LIST_HEAD(&ret->ordered_list); + spin_lock_init(&ret->list_lock); + return ret; +} + +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue_struct *wq); + struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, int flags, int max_active) @@ -748,19 +780,25 @@ struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, if (unlikely(!ret)) return NULL; - ret->normal_wq = alloc_workqueue("%s-%s", flags, max_active, - "btrfs", name); - if (unlikely(!ret->normal_wq)) { + ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + max_active); + if (unlikely(!ret->normal)) { kfree(ret); return NULL; } - INIT_LIST_HEAD(&ret->ordered_list); - spin_lock_init(&ret->list_lock); + if (flags & WQ_HIGHPRI) { + ret->high = __btrfs_alloc_workqueue(name, flags, max_active); + if (unlikely(!ret->high)) { + __btrfs_destroy_workqueue(ret->normal); + kfree(ret); + return NULL; + } + } return ret; } -static void run_ordered_work(struct btrfs_workqueue_struct *wq) +static void run_ordered_work(struct __btrfs_workqueue_struct *wq) { struct list_head *list = &wq->ordered_list; struct btrfs_work_struct *work; @@ -804,7 +842,7 @@ static void run_ordered_work(struct btrfs_workqueue_struct *wq) static void normal_work_helper(struct work_struct *arg) { struct btrfs_work_struct *work; - struct btrfs_workqueue_struct *wq; + struct __btrfs_workqueue_struct *wq; int need_order = 0; work = container_of(arg, struct btrfs_work_struct, normal_work); @@ -840,8 +878,8 @@ void btrfs_init_work(struct btrfs_work_struct *work, work->flags = 0; } -void btrfs_queue_work(struct btrfs_workqueue_struct *wq, - struct btrfs_work_struct *work) +static inline void __btrfs_queue_work(struct __btrfs_workqueue_struct *wq, + struct btrfs_work_struct *work) { unsigned long flags; @@ -854,13 +892,42 @@ void btrfs_queue_work(struct btrfs_workqueue_struct *wq, 
queue_work(wq->normal_wq, &work->normal_work); } -void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq) +void btrfs_queue_work(struct btrfs_workqueue_struct *wq, + struct btrfs_work_struct *work) +{ + struct __btrfs_workqueue_struct *dest_wq; + + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) + dest_wq = wq->high; + else + dest_wq = wq->normal; + __btrfs_queue_work(dest_wq, work); +} + +static inline void +__btrfs_destroy_workqueue(struct __btrfs_workqueue_struct *wq) { destroy_workqueue(wq->normal_wq); kfree(wq); } +void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq) +{ + if (!wq) + return; + if (wq->high) + __btrfs_destroy_workqueue(wq->high); + __btrfs_destroy_workqueue(wq->normal); +} + void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max) { - workqueue_set_max_active(wq->normal_wq, max); + workqueue_set_max_active(wq->normal->normal_wq, max); + if (wq->high) + workqueue_set_max_active(wq->high->normal_wq, max); +} + +void btrfs_set_work_high_priority(struct btrfs_work_struct *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 9d8da53f6dd9..fce623cfe3da 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -121,6 +121,8 @@ void btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); struct btrfs_workqueue_struct; +/* Internal use only */ +struct __btrfs_workqueue_struct; struct btrfs_work_struct { void (*func)(struct btrfs_work_struct *arg); @@ -130,7 +132,7 @@ struct btrfs_work_struct { /* Don't touch things below */ struct work_struct normal_work; struct list_head ordered_list; - struct btrfs_workqueue_struct *wq; + struct __btrfs_workqueue_struct *wq; unsigned long flags; }; @@ -145,4 +147,5 @@ void btrfs_queue_work(struct btrfs_workqueue_struct *wq, struct btrfs_work_struct *work); void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work_struct *work); #endif -- cgit v1.2.3 From 0bd9289c28c3b6a38f5a05a812afae0274674fa2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:05 +0800 Subject: btrfs: Add threshold workqueue based on kernel workqueue The original btrfs_workers has thresholding functions to dynamically create or destroy kthreads. Though there is no such function in kernel workqueue because the worker is not created manually, we can still use the workqueue_set_max_active to simulated the behavior, mainly to achieve a better HDD performance by setting a high threshold on submit_workers. (Sadly, no resource can be saved) So in this patch, extra workqueue pending counters are introduced to dynamically change the max active of each btrfs_workqueue_struct, hoping to restore the behavior of the original thresholding function. Also, workqueue_set_max_active use a mutex to protect workqueue_struct, which is not meant to be called too frequently, so a new interval mechanism is applied, that will only call workqueue_set_max_active after a count of work is queued. Hoping to balance both the random and sequence performance on HDD. 
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 107 ++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/async-thread.h | 3 +- 2 files changed, 101 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 193c84964db9..977bce2ec887 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -30,6 +30,9 @@ #define WORK_ORDER_DONE_BIT 2 #define WORK_HIGH_PRIO_BIT 3 +#define NO_THRESHOLD (-1) +#define DFT_THRESHOLD (32) + /* * container for the kthread task pointer and the list of pending work * One of these is allocated per thread. @@ -737,6 +740,14 @@ struct __btrfs_workqueue_struct { /* Spinlock for ordered_list */ spinlock_t list_lock; + + /* Thresholding related variants */ + atomic_t pending; + int max_active; + int current_max; + int thresh; + unsigned int count; + spinlock_t thres_lock; }; struct btrfs_workqueue_struct { @@ -745,19 +756,34 @@ struct btrfs_workqueue_struct { }; static inline struct __btrfs_workqueue_struct -*__btrfs_alloc_workqueue(char *name, int flags, int max_active) +*__btrfs_alloc_workqueue(char *name, int flags, int max_active, int thresh) { struct __btrfs_workqueue_struct *ret = kzalloc(sizeof(*ret), GFP_NOFS); if (unlikely(!ret)) return NULL; + ret->max_active = max_active; + atomic_set(&ret->pending, 0); + if (thresh == 0) + thresh = DFT_THRESHOLD; + /* For low threshold, disabling threshold is a better choice */ + if (thresh < DFT_THRESHOLD) { + ret->current_max = max_active; + ret->thresh = NO_THRESHOLD; + } else { + ret->current_max = 1; + ret->thresh = thresh; + } + if (flags & WQ_HIGHPRI) ret->normal_wq = alloc_workqueue("%s-%s-high", flags, - max_active, "btrfs", name); + ret->max_active, + "btrfs", name); else ret->normal_wq = alloc_workqueue("%s-%s", flags, - max_active, "btrfs", name); + ret->max_active, "btrfs", + name); if (unlikely(!ret->normal_wq)) { kfree(ret); return NULL; @@ -765,6 +791,7 @@ static inline struct __btrfs_workqueue_struct INIT_LIST_HEAD(&ret->ordered_list); spin_lock_init(&ret->list_lock); + spin_lock_init(&ret->thres_lock); return ret; } @@ -773,7 +800,8 @@ __btrfs_destroy_workqueue(struct __btrfs_workqueue_struct *wq); struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, int flags, - int max_active) + int max_active, + int thresh) { struct btrfs_workqueue_struct *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -781,14 +809,15 @@ struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, return NULL; ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, - max_active); + max_active, thresh); if (unlikely(!ret->normal)) { kfree(ret); return NULL; } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, max_active); + ret->high = __btrfs_alloc_workqueue(name, flags, max_active, + thresh); if (unlikely(!ret->high)) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); @@ -798,6 +827,66 @@ struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, return ret; } +/* + * Hook for threshold which will be called in btrfs_queue_work. + * This hook WILL be called in IRQ handler context, + * so workqueue_set_max_active MUST NOT be called in this hook + */ +static inline void thresh_queue_hook(struct __btrfs_workqueue_struct *wq) +{ + if (wq->thresh == NO_THRESHOLD) + return; + atomic_inc(&wq->pending); +} + +/* + * Hook for threshold which will be called before executing the work, + * This hook is called in kthread content. + * So workqueue_set_max_active is called here. 
+ */ +static inline void thresh_exec_hook(struct __btrfs_workqueue_struct *wq) +{ + int new_max_active; + long pending; + int need_change = 0; + + if (wq->thresh == NO_THRESHOLD) + return; + + atomic_dec(&wq->pending); + spin_lock(&wq->thres_lock); + /* + * Use wq->count to limit the calling frequency of + * workqueue_set_max_active. + */ + wq->count++; + wq->count %= (wq->thresh / 4); + if (!wq->count) + goto out; + new_max_active = wq->current_max; + + /* + * pending may be changed later, but it's OK since we really + * don't need it so accurate to calculate new_max_active. + */ + pending = atomic_read(&wq->pending); + if (pending > wq->thresh) + new_max_active++; + if (pending < wq->thresh / 2) + new_max_active--; + new_max_active = clamp_val(new_max_active, 1, wq->max_active); + if (new_max_active != wq->current_max) { + need_change = 1; + wq->current_max = new_max_active; + } +out: + spin_unlock(&wq->thres_lock); + + if (need_change) { + workqueue_set_max_active(wq->normal_wq, wq->current_max); + } +} + static void run_ordered_work(struct __btrfs_workqueue_struct *wq) { struct list_head *list = &wq->ordered_list; @@ -858,6 +947,7 @@ static void normal_work_helper(struct work_struct *arg) need_order = 1; wq = work->wq; + thresh_exec_hook(wq); work->func(work); if (need_order) { set_bit(WORK_DONE_BIT, &work->flags); @@ -884,6 +974,7 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue_struct *wq, unsigned long flags; work->wq = wq; + thresh_queue_hook(wq); if (work->ordered_func) { spin_lock_irqsave(&wq->list_lock, flags); list_add_tail(&work->ordered_list, &wq->ordered_list); @@ -922,9 +1013,9 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq) void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max) { - workqueue_set_max_active(wq->normal->normal_wq, max); + wq->normal->max_active = max; if (wq->high) - workqueue_set_max_active(wq->high->normal_wq, max); + wq->high->max_active = max; } void btrfs_set_work_high_priority(struct btrfs_work_struct *work) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index fce623cfe3da..3129d8a6128b 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -138,7 +138,8 @@ struct btrfs_work_struct { struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, int flags, - int max_active); + int max_active, + int thresh); void btrfs_init_work(struct btrfs_work_struct *work, void (*func)(struct btrfs_work_struct *), void (*ordered_func)(struct btrfs_work_struct *), -- cgit v1.2.3 From 5cdc7ad337fb08f630ac3538fb10e4a75de2572d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:06 +0800 Subject: btrfs: Replace fs_info->workers with btrfs_workqueue. 
Use the newly created btrfs_workqueue_struct to replace the original fs_info->workers Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 41 +++++++++++++++++++++-------------------- fs/btrfs/super.c | 2 +- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2c0336db691..bd7cb8ca4d6c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1505,7 +1505,7 @@ struct btrfs_fs_info { * two */ struct btrfs_workers generic_worker; - struct btrfs_workers workers; + struct btrfs_workqueue_struct *workers; struct btrfs_workers delalloc_workers; struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dd52146035b3..faafa5153fbd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -108,7 +108,7 @@ struct async_submit_bio { * can't tell us where in the file the bio should go */ u64 bio_offset; - struct btrfs_work work; + struct btrfs_work_struct work; int error; }; @@ -738,12 +738,12 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) { unsigned long limit = min_t(unsigned long, - info->workers.max_workers, + info->thread_pool_size, info->fs_devices->open_devices); return 256 * limit; } -static void run_one_async_start(struct btrfs_work *work) +static void run_one_async_start(struct btrfs_work_struct *work) { struct async_submit_bio *async; int ret; @@ -756,7 +756,7 @@ static void run_one_async_start(struct btrfs_work *work) async->error = ret; } -static void run_one_async_done(struct btrfs_work *work) +static void run_one_async_done(struct btrfs_work_struct *work) { struct btrfs_fs_info *fs_info; struct async_submit_bio *async; @@ -783,7 +783,7 @@ static void run_one_async_done(struct btrfs_work *work) async->bio_offset); } -static void run_one_async_free(struct btrfs_work *work) +static void run_one_async_free(struct btrfs_work_struct *work) { struct async_submit_bio *async; @@ -811,11 +811,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - async->work.func = run_one_async_start; - async->work.ordered_func = run_one_async_done; - async->work.ordered_free = run_one_async_free; + btrfs_init_work(&async->work, run_one_async_start, + run_one_async_done, run_one_async_free); - async->work.flags = 0; async->bio_flags = bio_flags; async->bio_offset = bio_offset; @@ -824,9 +822,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); if (rw & REQ_SYNC) - btrfs_set_work_high_prio(&async->work); + btrfs_set_work_high_priority(&async->work); - btrfs_queue_worker(&fs_info->workers, &async->work); + btrfs_queue_work(fs_info->workers, &async->work); while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -2000,7 +1998,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); - btrfs_stop_workers(&fs_info->workers); + btrfs_destroy_workqueue(fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); btrfs_stop_workers(&fs_info->endio_raid56_workers); @@ -2104,6 +2102,8 @@ int open_ctree(struct 
super_block *sb, int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; + int max_active; + int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; bool create_uuid_tree; bool check_uuid_tree; @@ -2472,12 +2472,13 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } + max_active = fs_info->thread_pool_size; btrfs_init_workers(&fs_info->generic_worker, "genwork", 1, NULL); - btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", fs_info->thread_pool_size, NULL); @@ -2498,9 +2499,6 @@ int open_ctree(struct super_block *sb, */ fs_info->submit_workers.idle_thresh = 64; - fs_info->workers.idle_thresh = 16; - fs_info->workers.ordered = 1; - fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; @@ -2552,8 +2550,7 @@ int open_ctree(struct super_block *sb, * btrfs_start_workers can really only fail because of ENOMEM so just * return -ENOMEM if any of these fail. */ - ret = btrfs_start_workers(&fs_info->workers); - ret |= btrfs_start_workers(&fs_info->generic_worker); + ret = btrfs_start_workers(&fs_info->generic_worker); ret |= btrfs_start_workers(&fs_info->submit_workers); ret |= btrfs_start_workers(&fs_info->delalloc_workers); ret |= btrfs_start_workers(&fs_info->fixup_workers); @@ -2573,6 +2570,10 @@ int open_ctree(struct super_block *sb, err = -ENOMEM; goto fail_sb_buffer; } + if (!(fs_info->workers)) { + err = -ENOMEM; + goto fail_sb_buffer; + } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 426b7c602653..6f66d8a2ef38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1324,7 +1324,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); -- cgit v1.2.3 From afe3d24267926eb78ba863016bdd65cfe718aef5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:07 +0800 Subject: btrfs: Replace fs_info->delalloc_workers with btrfs_workqueue Much like the fs_info->workers, replace the fs_info->delalloc_workers use the same btrfs_workqueue. 
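Each of these conversions follows the same shape. A condensed, kernel-style sketch of that pattern is shown below; the example_* names are hypothetical, while the btrfs_* calls are the ones introduced earlier in this series.

struct example_job {
	/* caller-owned state ... */
	struct btrfs_work_struct work;	/* embedded, recovered via container_of */
};

static void example_func(struct btrfs_work_struct *work)
{
	struct example_job *job = container_of(work, struct example_job, work);

	/* the concurrent part of the job runs here */
	(void)job;
}

static void example_free(struct btrfs_work_struct *work)
{
	kfree(container_of(work, struct example_job, work));
}

static void example_submit(struct btrfs_workqueue_struct *wq, struct example_job *job)
{
	/* ordered_func is NULL here, so only the concurrent func runs; passing
	 * an ordered_func/ordered_free pair instead serialises the completion
	 * phase in queueing order, as the async_cow conversion in this patch shows. */
	btrfs_init_work(&job->work, example_func, NULL, example_free);
	btrfs_queue_work(wq, &job->work);
}
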
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 12 ++++-------- fs/btrfs/inode.c | 18 ++++++++---------- fs/btrfs/super.c | 2 +- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bd7cb8ca4d6c..3b2c30d870e1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1506,7 +1506,7 @@ struct btrfs_fs_info { */ struct btrfs_workers generic_worker; struct btrfs_workqueue_struct *workers; - struct btrfs_workers delalloc_workers; + struct btrfs_workqueue_struct *delalloc_workers; struct btrfs_workers flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index faafa5153fbd..7eeb45f649bf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1997,7 +1997,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); - btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_stop_workers(&fs_info->endio_workers); btrfs_stop_workers(&fs_info->endio_meta_workers); @@ -2480,8 +2480,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, max_active, 16); - btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size, NULL); + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", fs_info->thread_pool_size, NULL); @@ -2499,9 +2499,6 @@ int open_ctree(struct super_block *sb, */ fs_info->submit_workers.idle_thresh = 64; - fs_info->delalloc_workers.idle_thresh = 2; - fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", @@ -2552,7 +2549,6 @@ int open_ctree(struct super_block *sb, */ ret = btrfs_start_workers(&fs_info->generic_worker); ret |= btrfs_start_workers(&fs_info->submit_workers); - ret |= btrfs_start_workers(&fs_info->delalloc_workers); ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->endio_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_workers); @@ -2570,7 +2566,7 @@ int open_ctree(struct super_block *sb, err = -ENOMEM; goto fail_sb_buffer; } - if (!(fs_info->workers)) { + if (!(fs_info->workers && fs_info->delalloc_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0182f081d499..a41a5a7aa3cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -324,7 +324,7 @@ struct async_cow { u64 start; u64 end; struct list_head extents; - struct btrfs_work work; + struct btrfs_work_struct work; }; static noinline int add_async_extent(struct async_cow *cow, @@ -1000,7 +1000,7 @@ out_unlock: /* * work queue call back to started compression on a file and pages */ -static noinline void async_cow_start(struct btrfs_work *work) +static noinline void async_cow_start(struct btrfs_work_struct *work) { struct async_cow *async_cow; int num_added = 0; @@ -1018,7 +1018,7 @@ static noinline void async_cow_start(struct btrfs_work *work) /* * work queue call back to submit previously compressed pages */ -static noinline void async_cow_submit(struct btrfs_work *work) +static noinline void async_cow_submit(struct 
btrfs_work_struct *work) { struct async_cow *async_cow; struct btrfs_root *root; @@ -1039,7 +1039,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) submit_compressed_extents(async_cow->inode, async_cow); } -static noinline void async_cow_free(struct btrfs_work *work) +static noinline void async_cow_free(struct btrfs_work_struct *work) { struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); @@ -1076,17 +1076,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - async_cow->work.func = async_cow_start; - async_cow->work.ordered_func = async_cow_submit; - async_cow->work.ordered_free = async_cow_free; - async_cow->work.flags = 0; + btrfs_init_work(&async_cow->work, async_cow_start, + async_cow_submit, async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); - btrfs_queue_worker(&root->fs_info->delalloc_workers, - &async_cow->work); + btrfs_queue_work(root->fs_info->delalloc_workers, + &async_cow->work); if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { wait_event(root->fs_info->async_submit_wait, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6f66d8a2ef38..be0019903264 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1325,7 +1325,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); - btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); -- cgit v1.2.3 From a8c93d4ef6f6727764a61a2ee1c1878a755637c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:08 +0800 Subject: btrfs: Replace fs_info->submit_workers with btrfs_workqueue. Much like the fs_info->workers, replace the fs_info->submit_workers use the same btrfs_workqueue. 
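One wrinkle in this patch: the old code re-queued a busy device with btrfs_requeue_work(), which has no direct equivalent in the new API, so the patch simply calls btrfs_queue_work() on the same work again. A hypothetical minimal sketch of that self-requeue pattern (names invented for illustration):

struct poll_job {
	struct btrfs_workqueue_struct *wq;	/* remembered so func can requeue itself */
	int remaining;
	struct btrfs_work_struct work;
};

static void poll_func(struct btrfs_work_struct *work)
{
	struct poll_job *job = container_of(work, struct poll_job, work);

	/* ... process one batch of pending items ... */
	if (--job->remaining > 0)
		btrfs_queue_work(job->wq, &job->work);	/* re-queue the same work */
}

static void poll_start(struct btrfs_workqueue_struct *wq, struct poll_job *job, int batches)
{
	job->wq = wq;
	job->remaining = batches;
	btrfs_init_work(&job->work, poll_func, NULL, NULL);
	btrfs_queue_work(wq, &job->work);
}
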
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 17 +++++++++-------- fs/btrfs/super.c | 2 +- fs/btrfs/volumes.c | 11 ++++++----- fs/btrfs/volumes.h | 2 +- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3b2c30d870e1..abed94213e6a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1515,7 +1515,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers endio_freespace_worker; - struct btrfs_workers submit_workers; + struct btrfs_workqueue_struct *submit_workers; struct btrfs_workers caching_workers; struct btrfs_workers readahead_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7eeb45f649bf..420328bacf49 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2006,7 +2006,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); - btrfs_stop_workers(&fs_info->submit_workers); + btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); @@ -2486,18 +2486,19 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", fs_info->thread_pool_size, NULL); - btrfs_init_workers(&fs_info->submit_workers, "submit", - min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size), NULL); btrfs_init_workers(&fs_info->caching_workers, "cache", fs_info->thread_pool_size, NULL); - /* a higher idle thresh on the submit workers makes it much more + /* + * a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the * devices */ - fs_info->submit_workers.idle_thresh = 64; + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, &fs_info->generic_worker); @@ -2548,7 +2549,6 @@ int open_ctree(struct super_block *sb, * return -ENOMEM if any of these fail. 
*/ ret = btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->submit_workers); ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->endio_workers); ret |= btrfs_start_workers(&fs_info->endio_meta_workers); @@ -2566,7 +2566,8 @@ int open_ctree(struct super_block *sb, err = -ENOMEM; goto fail_sb_buffer; } - if (!(fs_info->workers && fs_info->delalloc_workers)) { + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index be0019903264..9ed559ebe914 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1326,7 +1326,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 82a63b1a36ef..0066cff077ce 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -415,7 +415,8 @@ loop_lock: device->running_pending = 1; spin_unlock(&device->io_lock); - btrfs_requeue_work(&device->work); + btrfs_queue_work(fs_info->submit_workers, + &device->work); goto done; } /* unplug every 64 requests just for good measure */ @@ -439,7 +440,7 @@ done: blk_finish_plug(&plug); } -static void pending_bios_fn(struct btrfs_work *work) +static void pending_bios_fn(struct btrfs_work_struct *work) { struct btrfs_device *device; @@ -5379,8 +5380,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, spin_unlock(&device->io_lock); if (should_queue) - btrfs_queue_worker(&root->fs_info->submit_workers, - &device->work); + btrfs_queue_work(root->fs_info->submit_workers, + &device->work); } static int bio_size_ok(struct block_device *bdev, struct bio *bio, @@ -5668,7 +5669,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - dev->work.func = pending_bios_fn; + btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); return dev; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 80754f9dd3df..5d9a03773ca6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -95,7 +95,7 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_device; - struct btrfs_work work; + struct btrfs_work_struct work; struct rcu_head rcu; struct work_struct rcu_work; -- cgit v1.2.3 From a44903abe9dc23ffa305898368a7a910dbae13c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:09 +0800 Subject: btrfs: Replace fs_info->flush_workers with btrfs_workqueue. Replace the fs_info->submit_workers with the newly created btrfs_workqueue. 
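For orientation, here is a hypothetical single-inode version of the flush pattern that __start_delalloc_inodes() keeps after this change; the real code batches many works on a list before waiting, and flush_one_inode is not in the patch.

static int flush_one_inode(struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	work = btrfs_alloc_delalloc_work(inode, 1, 0);	/* wait = 1, delay_iput = 0 */
	if (!work)
		return -ENOMEM;
	btrfs_queue_work(root->fs_info->flush_workers, &work->work);
	/* btrfs_run_delalloc_work() signals work->completion when it finishes */
	wait_for_completion(&work->completion);
	kfree(work);	/* the real callers use the dedicated free helper */
	return 0;
}
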
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/disk-io.c | 10 ++++------ fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ordered-data.c | 13 +++++++------ fs/btrfs/ordered-data.h | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index abed94213e6a..c31a102d34de 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1507,7 +1507,7 @@ struct btrfs_fs_info { struct btrfs_workers generic_worker; struct btrfs_workqueue_struct *workers; struct btrfs_workqueue_struct *delalloc_workers; - struct btrfs_workers flush_workers; + struct btrfs_workqueue_struct *flush_workers; struct btrfs_workers endio_workers; struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_raid56_workers; @@ -3681,7 +3681,7 @@ struct btrfs_delalloc_work { int delay_iput; struct completion completion; struct list_head list; - struct btrfs_work work; + struct btrfs_work_struct work; }; struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 420328bacf49..5b82b0b31ec8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2010,7 +2010,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); - btrfs_stop_workers(&fs_info->flush_workers); + btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_stop_workers(&fs_info->qgroup_rescan_workers); } @@ -2483,9 +2483,8 @@ int open_ctree(struct super_block *sb, fs_info->delalloc_workers = btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc", - fs_info->thread_pool_size, NULL); - + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); btrfs_init_workers(&fs_info->caching_workers, "cache", fs_info->thread_pool_size, NULL); @@ -2560,14 +2559,13 @@ int open_ctree(struct super_block *sb, ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); - ret |= btrfs_start_workers(&fs_info->flush_workers); ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; } if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers)) { + fs_info->submit_workers && fs_info->flush_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a41a5a7aa3cb..6c043bed0c32 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8386,7 +8386,7 @@ out_notrans: return ret; } -static void btrfs_run_delalloc_work(struct btrfs_work *work) +static void btrfs_run_delalloc_work(struct btrfs_work_struct *work) { struct btrfs_delalloc_work *delalloc_work; struct inode *inode; @@ -8424,7 +8424,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - work->work.func = btrfs_run_delalloc_work; + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); return work; } @@ -8476,8 +8476,8 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); 
cond_resched(); spin_lock(&root->delalloc_lock); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 138a7d7e9c90..6fa8219b5d03 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -576,7 +576,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, wake_up(&entry->wait); } -static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +static void btrfs_run_ordered_extent_work(struct btrfs_work_struct *work) { struct btrfs_ordered_extent *ordered; @@ -609,10 +609,11 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) atomic_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); - ordered->flush_work.func = btrfs_run_ordered_extent_work; + btrfs_init_work(&ordered->flush_work, + btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &ordered->flush_work); + btrfs_queue_work(root->fs_info->flush_workers, + &ordered->flush_work); cond_resched(); spin_lock(&root->ordered_extent_lock); @@ -725,8 +726,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, goto out; } list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + btrfs_queue_work(root->fs_info->flush_workers, + &work->work); cond_resched(); spin_lock(&root->fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 246897058efb..fe9f4dbab09c 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -133,7 +133,7 @@ struct btrfs_ordered_extent { struct btrfs_work work; struct completion completion; - struct btrfs_work flush_work; + struct btrfs_work_struct flush_work; struct list_head work_list; }; -- cgit v1.2.3 From fccb5d86d8f52161e013025ccf3101d8fab99a32 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:10 +0800 Subject: btrfs: Replace fs_info->endio_* workqueue with btrfs_workqueue. Replace the fs_info->endio_* workqueues with the newly created btrfs_workqueue. 
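The routing that end_workqueue_bio() performs after this conversion is easier to read as a single selector; the helper below is an illustrative condensation of the hunk in this patch, not code from the diff itself.

static struct btrfs_workqueue_struct *
pick_endio_wq(struct btrfs_fs_info *fs_info, int is_write, int metadata)
{
	if (is_write) {
		if (metadata == BTRFS_WQ_ENDIO_METADATA)
			return fs_info->endio_meta_write_workers;
		if (metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			return fs_info->endio_freespace_worker;
		if (metadata == BTRFS_WQ_ENDIO_RAID56)
			return fs_info->endio_raid56_workers;
		return fs_info->endio_write_workers;
	}
	if (metadata == BTRFS_WQ_ENDIO_RAID56)
		return fs_info->endio_raid56_workers;
	if (metadata)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}
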
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 12 +++--- fs/btrfs/disk-io.c | 104 +++++++++++++++++++++--------------------------- fs/btrfs/inode.c | 20 +++++----- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/super.c | 11 ++--- 5 files changed, 68 insertions(+), 81 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c31a102d34de..42bf0da250f5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1508,13 +1508,13 @@ struct btrfs_fs_info { struct btrfs_workqueue_struct *workers; struct btrfs_workqueue_struct *delalloc_workers; struct btrfs_workqueue_struct *flush_workers; - struct btrfs_workers endio_workers; - struct btrfs_workers endio_meta_workers; - struct btrfs_workers endio_raid56_workers; + struct btrfs_workqueue_struct *endio_workers; + struct btrfs_workqueue_struct *endio_meta_workers; + struct btrfs_workqueue_struct *endio_raid56_workers; struct btrfs_workers rmw_workers; - struct btrfs_workers endio_meta_write_workers; - struct btrfs_workers endio_write_workers; - struct btrfs_workers endio_freespace_worker; + struct btrfs_workqueue_struct *endio_meta_write_workers; + struct btrfs_workqueue_struct *endio_write_workers; + struct btrfs_workqueue_struct *endio_freespace_worker; struct btrfs_workqueue_struct *submit_workers; struct btrfs_workers caching_workers; struct btrfs_workers readahead_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5b82b0b31ec8..8ce0214e3bac 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,7 +55,7 @@ #endif static struct extent_io_ops btree_extent_io_ops; -static void end_workqueue_fn(struct btrfs_work *work); +static void end_workqueue_fn(struct btrfs_work_struct *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); @@ -86,7 +86,7 @@ struct end_io_wq { int error; int metadata; struct list_head list; - struct btrfs_work work; + struct btrfs_work_struct work; }; /* @@ -678,32 +678,31 @@ static void end_workqueue_bio(struct bio *bio, int err) fs_info = end_io_wq->info; end_io_wq->error = err; - end_io_wq->work.func = end_workqueue_fn; - end_io_wq->work.flags = 0; + btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_worker(&fs_info->endio_meta_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_write_workers, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_worker(&fs_info->endio_freespace_worker, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_freespace_worker, + &end_io_wq->work); else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_write_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_write_workers, + &end_io_wq->work); } else { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_worker(&fs_info->endio_raid56_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_raid56_workers, + &end_io_wq->work); else if (end_io_wq->metadata) - btrfs_queue_worker(&fs_info->endio_meta_workers, - &end_io_wq->work); + btrfs_queue_work(fs_info->endio_meta_workers, + &end_io_wq->work); else - btrfs_queue_worker(&fs_info->endio_workers, - &end_io_wq->work); + 
btrfs_queue_work(fs_info->endio_workers, + &end_io_wq->work); } } @@ -1669,7 +1668,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens */ -static void end_workqueue_fn(struct btrfs_work *work) +static void end_workqueue_fn(struct btrfs_work_struct *work) { struct bio *bio; struct end_io_wq *end_io_wq; @@ -1999,13 +1998,13 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_stop_workers(&fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); - btrfs_stop_workers(&fs_info->endio_workers); - btrfs_stop_workers(&fs_info->endio_meta_workers); - btrfs_stop_workers(&fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->endio_workers); + btrfs_destroy_workqueue(fs_info->endio_meta_workers); + btrfs_destroy_workqueue(fs_info->endio_raid56_workers); btrfs_stop_workers(&fs_info->rmw_workers); - btrfs_stop_workers(&fs_info->endio_meta_write_workers); - btrfs_stop_workers(&fs_info->endio_write_workers); - btrfs_stop_workers(&fs_info->endio_freespace_worker); + btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + btrfs_destroy_workqueue(fs_info->endio_write_workers); + btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_stop_workers(&fs_info->caching_workers); @@ -2501,26 +2500,26 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_raid56_workers, - "endio-raid56", fs_info->thread_pool_size, - &fs_info->generic_worker); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); btrfs_init_workers(&fs_info->rmw_workers, "rmw", fs_info->thread_pool_size, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size, - &fs_info->generic_worker); - btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", - 1, &fs_info->generic_worker); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); @@ -2530,17 +2529,8 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, &fs_info->generic_worker); - /* - * endios are largely parallel and should have a very - * low idle 
thresh - */ - fs_info->endio_workers.idle_thresh = 4; - fs_info->endio_meta_workers.idle_thresh = 4; - fs_info->endio_raid56_workers.idle_thresh = 4; fs_info->rmw_workers.idle_thresh = 2; - fs_info->endio_write_workers.idle_thresh = 2; - fs_info->endio_meta_write_workers.idle_thresh = 2; fs_info->readahead_workers.idle_thresh = 2; /* @@ -2549,13 +2539,7 @@ int open_ctree(struct super_block *sb, */ ret = btrfs_start_workers(&fs_info->generic_worker); ret |= btrfs_start_workers(&fs_info->fixup_workers); - ret |= btrfs_start_workers(&fs_info->endio_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_workers); ret |= btrfs_start_workers(&fs_info->rmw_workers); - ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); - ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_write_workers); - ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); @@ -2565,7 +2549,11 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers)) { + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6c043bed0c32..ce3f73046605 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2750,7 +2750,7 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work *work) +static void finish_ordered_fn(struct btrfs_work_struct *work) { struct btrfs_ordered_extent *ordered_extent; ordered_extent = container_of(work, struct btrfs_ordered_extent, work); @@ -2763,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workers *workers; + struct btrfs_workqueue_struct *workers; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2772,14 +2772,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - ordered_extent->work.func = finish_ordered_fn; - ordered_extent->work.flags = 0; + btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); if (btrfs_is_free_space_inode(inode)) - workers = &root->fs_info->endio_freespace_worker; + workers = root->fs_info->endio_freespace_worker; else - workers = &root->fs_info->endio_write_workers; - btrfs_queue_worker(workers, &ordered_extent->work); + workers = root->fs_info->endio_write_workers; + btrfs_queue_work(workers, &ordered_extent->work); return 0; } @@ -7046,10 +7045,9 @@ again: if (!ret) goto out_test; - ordered->work.func = finish_ordered_fn; - ordered->work.flags = 0; - btrfs_queue_worker(&root->fs_info->endio_write_workers, - &ordered->work); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(root->fs_info->endio_write_workers, + &ordered->work); out_test: /* * our bio might span multiple ordered extents. 
If we haven't diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index fe9f4dbab09c..84bb236119fe 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -130,7 +130,7 @@ struct btrfs_ordered_extent { /* a per root list of all the pending ordered extents */ struct list_head root_extent_list; - struct btrfs_work work; + struct btrfs_work_struct work; struct completion completion; struct btrfs_work_struct flush_work; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9ed559ebe914..d95d98d3e72c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1329,11 +1329,12 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, + new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, -- cgit v1.2.3 From d05a33ac265c62d4be35788dd978b2665033f077 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:11 +0800 Subject: btrfs: Replace fs_info->rmw_workers workqueue with btrfs_workqueue. Replace the fs_info->rmw_workers with the newly created btrfs_workqueue. 
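For reference, a minimal sketch of the conversion pattern applied here (not part of the patch; "flags" and "max_active" stand in for the locals open_ctree() already passes to the other btrfs_alloc_workqueue() calls, and the API calls are the ones visible in the diff below):

    /* mount time: allocate the workqueue, NULL means -ENOMEM */
    fs_info->rmw_workers =
            btrfs_alloc_workqueue("rmw", flags, max_active, 2);

    /* submission: initialize the work item once, then queue it */
    btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
    btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);

    /* unmount time: tear the queue down */
    btrfs_destroy_workqueue(fs_info->rmw_workers);
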
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 12 ++++-------- fs/btrfs/raid56.c | 35 ++++++++++++++++------------------- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 42bf0da250f5..8102fcd8f1fe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1511,7 +1511,7 @@ struct btrfs_fs_info { struct btrfs_workqueue_struct *endio_workers; struct btrfs_workqueue_struct *endio_meta_workers; struct btrfs_workqueue_struct *endio_raid56_workers; - struct btrfs_workers rmw_workers; + struct btrfs_workqueue_struct *rmw_workers; struct btrfs_workqueue_struct *endio_meta_write_workers; struct btrfs_workqueue_struct *endio_write_workers; struct btrfs_workqueue_struct *endio_freespace_worker; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8ce0214e3bac..5f12806e96e8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2001,7 +2001,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_stop_workers(&fs_info->rmw_workers); + btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); @@ -2513,9 +2513,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); - btrfs_init_workers(&fs_info->rmw_workers, - "rmw", fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); fs_info->endio_write_workers = btrfs_alloc_workqueue("endio-write", flags, max_active, 2); fs_info->endio_freespace_worker = @@ -2529,8 +2528,6 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, &fs_info->generic_worker); - fs_info->rmw_workers.idle_thresh = 2; - fs_info->readahead_workers.idle_thresh = 2; /* @@ -2539,7 +2536,6 @@ int open_ctree(struct super_block *sb, */ ret = btrfs_start_workers(&fs_info->generic_worker); ret |= btrfs_start_workers(&fs_info->fixup_workers); - ret |= btrfs_start_workers(&fs_info->rmw_workers); ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); @@ -2553,7 +2549,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && - fs_info->endio_freespace_worker)) { + fs_info->endio_freespace_worker && fs_info->rmw_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24ac21840a9a..5afa564201a2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -87,7 +87,7 @@ struct btrfs_raid_bio { /* * for scheduling work in the helper threads */ - struct btrfs_work work; + struct btrfs_work_struct work; /* * bio list and bio_list_lock are used @@ -166,8 +166,8 @@ struct btrfs_raid_bio { static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work 
*work); -static void read_rebuild_work(struct btrfs_work *work); +static void rmw_work(struct btrfs_work_struct *work); +static void read_rebuild_work(struct btrfs_work_struct *work); static void async_rmw_stripe(struct btrfs_raid_bio *rbio); static void async_read_rebuild(struct btrfs_raid_bio *rbio); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); @@ -1416,20 +1416,18 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = rmw_work; + btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - rbio->work.flags = 0; - rbio->work.func = read_rebuild_work; + btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); - btrfs_queue_worker(&rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); } /* @@ -1590,7 +1588,7 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work work; + struct btrfs_work_struct work; }; /* @@ -1654,7 +1652,7 @@ static void run_plug(struct btrfs_plug_cb *plug) * if the unplug comes from schedule, we have to push the * work off to a helper thread */ -static void unplug_work(struct btrfs_work *work) +static void unplug_work(struct btrfs_work_struct *work) { struct btrfs_plug_cb *plug; plug = container_of(work, struct btrfs_plug_cb, work); @@ -1667,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - plug->work.flags = 0; - plug->work.func = unplug_work; - btrfs_queue_worker(&plug->info->rmw_workers, - &plug->work); + btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_queue_work(plug->info->rmw_workers, + &plug->work); return; } run_plug(plug); @@ -2082,7 +2079,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, } -static void rmw_work(struct btrfs_work *work) +static void rmw_work(struct btrfs_work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2090,7 +2087,7 @@ static void rmw_work(struct btrfs_work *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct btrfs_work *work) +static void read_rebuild_work(struct btrfs_work_struct *work) { struct btrfs_raid_bio *rbio; -- cgit v1.2.3 From e66f0bb14465371d4c86fa70cff2acc331efa1fb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:12 +0800 Subject: btrfs: Replace fs_info->cache_workers workqueue with btrfs_workqueue. Replace the fs_info->cache_workers with the newly created btrfs_workqueue. 
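The work functions themselves only need their argument type switched to the new struct; they still get back to the owning object with container_of(). A sketch of that shape, mirroring caching_thread() in the diff below (illustration only):

    static void caching_thread(struct btrfs_work_struct *work)
    {
            struct btrfs_caching_control *caching_ctl;

            caching_ctl = container_of(work, struct btrfs_caching_control,
                                       work);
            /* ... cache the block group as before ... */
    }

    /*
     * submission site: btrfs_init_work() + btrfs_queue_work() replace
     * the old work.func assignment and btrfs_queue_worker()
     */
    btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
    btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
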
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/disk-io.c | 10 +++++----- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/super.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8102fcd8f1fe..e5c94cbaa3fa 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1221,7 +1221,7 @@ struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; - struct btrfs_work work; + struct btrfs_work_struct work; struct btrfs_block_group_cache *block_group; u64 progress; atomic_t count; @@ -1516,7 +1516,7 @@ struct btrfs_fs_info { struct btrfs_workqueue_struct *endio_write_workers; struct btrfs_workqueue_struct *endio_freespace_worker; struct btrfs_workqueue_struct *submit_workers; - struct btrfs_workers caching_workers; + struct btrfs_workqueue_struct *caching_workers; struct btrfs_workers readahead_workers; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5f12806e96e8..9c14e3bd078c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2007,7 +2007,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); - btrfs_stop_workers(&fs_info->caching_workers); + btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_stop_workers(&fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_stop_workers(&fs_info->qgroup_rescan_workers); @@ -2485,8 +2485,8 @@ int open_ctree(struct super_block *sb, fs_info->flush_workers = btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - btrfs_init_workers(&fs_info->caching_workers, "cache", - fs_info->thread_pool_size, NULL); + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2537,7 +2537,6 @@ int open_ctree(struct super_block *sb, ret = btrfs_start_workers(&fs_info->generic_worker); ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->delayed_workers); - ret |= btrfs_start_workers(&fs_info->caching_workers); ret |= btrfs_start_workers(&fs_info->readahead_workers); ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { @@ -2549,7 +2548,8 @@ int open_ctree(struct super_block *sb, fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && - fs_info->endio_freespace_worker && fs_info->rmw_workers)) { + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 32312e09f0f5..bb58082f6d61 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -378,7 +378,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work *work) +static noinline void caching_thread(struct btrfs_work_struct *work) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; @@ -549,7 +549,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - caching_ctl->work.func = caching_thread; 
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -640,7 +640,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, btrfs_get_block_group(cache); - btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work); + btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d95d98d3e72c..b84fbe04f05a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1327,7 +1327,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); -- cgit v1.2.3 From 736cfa15e89a654436d4149c109bf1ae09fc67cf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:13 +0800 Subject: btrfs: Replace fs_info->readahead_workers workqueue with btrfs_workqueue. Replace the fs_info->readahead_workers with the newly created btrfs_workqueue. Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 12 ++++-------- fs/btrfs/reada.c | 9 +++++---- fs/btrfs/super.c | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5c94cbaa3fa..b5f2a19177e8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1517,7 +1517,7 @@ struct btrfs_fs_info { struct btrfs_workqueue_struct *endio_freespace_worker; struct btrfs_workqueue_struct *submit_workers; struct btrfs_workqueue_struct *caching_workers; - struct btrfs_workers readahead_workers; + struct btrfs_workqueue_struct *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9c14e3bd078c..c0b003bb66cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2008,7 +2008,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->submit_workers); btrfs_stop_workers(&fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); - btrfs_stop_workers(&fs_info->readahead_workers); + btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_stop_workers(&fs_info->qgroup_rescan_workers); } @@ -2522,14 +2522,11 @@ int open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", fs_info->thread_pool_size, &fs_info->generic_worker); - btrfs_init_workers(&fs_info->readahead_workers, "readahead", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->readahead_workers = + btrfs_alloc_workqueue("readahead", flags, max_active, 2); btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, &fs_info->generic_worker); - fs_info->readahead_workers.idle_thresh = 2; - /* * btrfs_start_workers can really only fail because of ENOMEM so just * return -ENOMEM if any of these fail. 
@@ -2537,7 +2534,6 @@ int open_ctree(struct super_block *sb, ret = btrfs_start_workers(&fs_info->generic_worker); ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->delayed_workers); - ret |= btrfs_start_workers(&fs_info->readahead_workers); ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { err = -ENOMEM; @@ -2549,7 +2545,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers)) { + fs_info->caching_workers && fs_info->readahead_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 31c797c48c3e..9e01d3677355 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -91,7 +91,8 @@ struct reada_zone { }; struct reada_machine_work { - struct btrfs_work work; + struct btrfs_work_struct + work; struct btrfs_fs_info *fs_info; }; @@ -733,7 +734,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } -static void reada_start_machine_worker(struct btrfs_work *work) +static void reada_start_machine_worker(struct btrfs_work_struct *work) { struct reada_machine_work *rmw; struct btrfs_fs_info *fs_info; @@ -793,10 +794,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - rmw->work.func = reada_start_machine_worker; + btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; - btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work); + btrfs_queue_work(fs_info->readahead_workers, &rmw->work); } #ifdef DEBUG diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b84fbe04f05a..ce9d012a77e8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1336,7 +1336,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, new_pool_size); } -- cgit v1.2.3 From dc6e320998fb907e4c19032d545d461bfe5040d1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:14 +0800 Subject: btrfs: Replace fs_info->fixup_workers workqueue with btrfs_workqueue. Replace the fs_info->fixup_workers with the newly created btrfs_workqueue. Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 10 +++++----- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/super.c | 1 - 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b5f2a19177e8..dd79fc5a8c99 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1524,7 +1524,7 @@ struct btrfs_fs_info { * the cow mechanism and make them safe to write. 
It happens * for the sys_munmap function call path */ - struct btrfs_workers fixup_workers; + struct btrfs_workqueue_struct *fixup_workers; struct btrfs_workers delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c0b003bb66cd..392cd3baefe4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1995,7 +1995,7 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_stop_workers(&fs_info->generic_worker); - btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); @@ -2498,8 +2498,8 @@ int open_ctree(struct super_block *sb, min_t(u64, fs_devices->num_devices, max_active), 64); - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, - &fs_info->generic_worker); + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); /* * endios are largely parallel and should have a very @@ -2532,7 +2532,6 @@ int open_ctree(struct super_block *sb, * return -ENOMEM if any of these fail. */ ret = btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->fixup_workers); ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { @@ -2545,7 +2544,8 @@ int open_ctree(struct super_block *sb, fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers)) { + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ce3f73046605..0885f333574d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1748,10 +1748,10 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct btrfs_work work; + struct btrfs_work_struct work; }; -static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +static void btrfs_writepage_fixup_worker(struct btrfs_work_struct *work) { struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; @@ -1842,9 +1842,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - fixup->work.func = btrfs_writepage_fixup_worker; + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ce9d012a77e8..2e1d6cf4dc66 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1328,7 +1328,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); 
btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, -- cgit v1.2.3 From 5b3bc44e2e69d42edf40ca3785040d233ca949f4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:15 +0800 Subject: btrfs: Replace fs_info->delayed_workers workqueue with btrfs_workqueue. Replace the fs_info->delayed_workers with the newly created btrfs_workqueue. Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-inode.c | 10 +++++----- fs/btrfs/disk-io.c | 10 ++++------ fs/btrfs/super.c | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dd79fc5a8c99..c07b67f6f924 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1525,7 +1525,7 @@ struct btrfs_fs_info { * for the sys_munmap function call path */ struct btrfs_workqueue_struct *fixup_workers; - struct btrfs_workers delayed_workers; + struct btrfs_workqueue_struct *delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 451b00c86f6c..76e85d66801f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1318,10 +1318,10 @@ void btrfs_remove_delayed_node(struct inode *inode) struct btrfs_async_delayed_work { struct btrfs_delayed_root *delayed_root; int nr; - struct btrfs_work work; + struct btrfs_work_struct work; }; -static void btrfs_async_run_delayed_root(struct btrfs_work *work) +static void btrfs_async_run_delayed_root(struct btrfs_work_struct *work) { struct btrfs_async_delayed_work *async_work; struct btrfs_delayed_root *delayed_root; @@ -1392,11 +1392,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - async_work->work.func = btrfs_async_run_delayed_root; - async_work->work.flags = 0; + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, + NULL, NULL); async_work->nr = nr; - btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 392cd3baefe4..f5da1fd23ee9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2006,7 +2006,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->submit_workers); - btrfs_stop_workers(&fs_info->delayed_workers); + btrfs_destroy_workqueue(fs_info->delayed_workers); btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); @@ -2519,9 +2519,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("endio-write", flags, max_active, 2); fs_info->endio_freespace_worker = btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); - btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", - fs_info->thread_pool_size, - &fs_info->generic_worker); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); fs_info->readahead_workers = btrfs_alloc_workqueue("readahead", flags, max_active, 2); btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, @@ -2532,7 +2531,6 @@ int open_ctree(struct 
super_block *sb, * return -ENOMEM if any of these fail. */ ret = btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->delayed_workers); ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { err = -ENOMEM; @@ -2545,7 +2543,7 @@ int open_ctree(struct super_block *sb, fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers)) { + fs_info->fixup_workers && fs_info->delayed_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2e1d6cf4dc66..fd07d039b2de 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1334,7 +1334,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); - btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, new_pool_size); -- cgit v1.2.3 From fc97fab0ea59fb923cbe91b7d208ffc6f1d8a95c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:16 +0800 Subject: btrfs: Replace fs_info->qgroup_rescan_worker workqueue with btrfs_workqueue. Replace the fs_info->qgroup_rescan_worker with the newly created btrfs_workqueue. Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/disk-io.c | 10 +++++----- fs/btrfs/qgroup.c | 17 +++++++++-------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c07b67f6f924..7b50def3f206 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1648,9 +1648,9 @@ struct btrfs_fs_info { /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; - struct btrfs_workers qgroup_rescan_workers; + struct btrfs_workqueue_struct *qgroup_rescan_workers; struct completion qgroup_rescan_completion; - struct btrfs_work qgroup_rescan_work; + struct btrfs_work_struct qgroup_rescan_work; /* filesystem state */ unsigned long fs_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f5da1fd23ee9..9aaf9c309b54 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2010,7 +2010,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->caching_workers); btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); - btrfs_stop_workers(&fs_info->qgroup_rescan_workers); + btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2523,15 +2523,14 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); fs_info->readahead_workers = btrfs_alloc_workqueue("readahead", flags, max_active, 2); - btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, - &fs_info->generic_worker); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); /* * btrfs_start_workers can really only fail because of ENOMEM so just * return -ENOMEM if any of these fail. 
*/ ret = btrfs_start_workers(&fs_info->generic_worker); - ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); if (ret) { err = -ENOMEM; goto fail_sb_buffer; @@ -2543,7 +2542,8 @@ int open_ctree(struct super_block *sb, fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers)) { + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 472302a2d745..38617cc2fdd5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1509,8 +1509,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } ret = 0; } @@ -1984,7 +1984,7 @@ out: return ret; } -static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) +static void btrfs_qgroup_rescan_worker(struct btrfs_work_struct *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, qgroup_rescan_work); @@ -2095,7 +2095,8 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); - fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker; + btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { err: @@ -2158,8 +2159,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) qgroup_rescan_zero_tracking(fs_info); - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); return 0; } @@ -2190,6 +2191,6 @@ void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) { if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) - btrfs_queue_worker(&fs_info->qgroup_rescan_workers, - &fs_info->qgroup_rescan_work); + btrfs_queue_work(fs_info->qgroup_rescan_workers, + &fs_info->qgroup_rescan_work); } -- cgit v1.2.3 From 0339ef2f42bcfbb2d4021ad6f38fe20580082c85 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:17 +0800 Subject: btrfs: Replace fs_info->scrub_* workqueue with btrfs_workqueue. Replace the fs_info->scrub_* with the newly created btrfs_workqueue. 
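One practical difference from the old btrfs_start_workers() model is error handling: btrfs_alloc_workqueue() returns NULL on allocation failure, so callers test the pointer and map it to -ENOMEM themselves. A sketch of the shape scrub_workers_get() takes in the diff below (illustration only, reusing the flags and pool size from that function):

    int flags = WQ_FREEZABLE | WQ_UNBOUND;
    int max_active = fs_info->thread_pool_size;

    fs_info->scrub_workers =
            btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4);
    if (!fs_info->scrub_workers) {
            ret = -ENOMEM;
            goto out;
    }
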
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 ++-- fs/btrfs/scrub.c | 93 ++++++++++++++++++++++++++++++-------------------------- fs/btrfs/super.c | 4 +-- 3 files changed, 55 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7b50def3f206..a98f86ac187e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1605,9 +1605,9 @@ struct btrfs_fs_info { atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; int scrub_workers_refcnt; - struct btrfs_workers scrub_workers; - struct btrfs_workers scrub_wr_completion_workers; - struct btrfs_workers scrub_nocow_workers; + struct btrfs_workqueue_struct *scrub_workers; + struct btrfs_workqueue_struct *scrub_wr_completion_workers; + struct btrfs_workqueue_struct *scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 682ec3fca4a1..5a240f5e6ceb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -96,7 +96,8 @@ struct scrub_bio { #endif int page_count; int next_free; - struct btrfs_work work; + struct btrfs_work_struct + work; }; struct scrub_block { @@ -154,7 +155,8 @@ struct scrub_fixup_nodatasum { struct btrfs_device *dev; u64 logical; struct btrfs_root *root; - struct btrfs_work work; + struct btrfs_work_struct + work; int mirror_num; }; @@ -172,7 +174,8 @@ struct scrub_copy_nocow_ctx { int mirror_num; u64 physical_for_dev_replace; struct list_head inodes; - struct btrfs_work work; + struct btrfs_work_struct + work; }; struct scrub_warning { @@ -231,7 +234,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_bio_end_io_worker(struct btrfs_work_struct *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, u64 extent_logical, u64 extent_len, @@ -248,14 +251,14 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio, int err); -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static void scrub_wr_bio_end_io_worker(struct btrfs_work_struct *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *ctx); static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); -static void copy_nocow_pages_worker(struct btrfs_work *work); +static void copy_nocow_pages_worker(struct btrfs_work_struct *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); @@ -428,7 +431,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - sbio->work.func = scrub_bio_end_io_worker; + btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, + NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -733,7 +737,7 @@ out: return -EIO; } -static void scrub_fixup_nodatasum(struct btrfs_work *work) +static void scrub_fixup_nodatasum(struct 
btrfs_work_struct *work) { int ret; struct scrub_fixup_nodatasum *fixup; @@ -997,9 +1001,10 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - fixup_nodatasum->work.func = scrub_fixup_nodatasum; - btrfs_queue_worker(&fs_info->scrub_workers, - &fixup_nodatasum->work); + btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, + NULL, NULL); + btrfs_queue_work(fs_info->scrub_workers, + &fixup_nodatasum->work); goto out; } @@ -1613,11 +1618,11 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - sbio->work.func = scrub_wr_bio_end_io_worker; - btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work); + btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +static void scrub_wr_bio_end_io_worker(struct btrfs_work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; @@ -2082,10 +2087,10 @@ static void scrub_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); + btrfs_queue_work(fs_info->scrub_workers, &sbio->work); } -static void scrub_bio_end_io_worker(struct btrfs_work *work) +static void scrub_bio_end_io_worker(struct btrfs_work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; @@ -2780,33 +2785,35 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { int ret = 0; + int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) - btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1, - &fs_info->generic_worker); + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + 1, 4); else - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_workers.idle_thresh = 4; - ret = btrfs_start_workers(&fs_info->scrub_workers); - if (ret) + fs_info->scrub_workers = + btrfs_alloc_workqueue("btrfs-scrub", flags, + max_active, 4); + if (!fs_info->scrub_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_wr_completion_workers, - "scrubwrc", - fs_info->thread_pool_size, - &fs_info->generic_worker); - fs_info->scrub_wr_completion_workers.idle_thresh = 2; - ret = btrfs_start_workers( - &fs_info->scrub_wr_completion_workers); - if (ret) + } + fs_info->scrub_wr_completion_workers = + btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + max_active, 2); + if (!fs_info->scrub_wr_completion_workers) { + ret = -ENOMEM; goto out; - btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1, - &fs_info->generic_worker); - ret = btrfs_start_workers(&fs_info->scrub_nocow_workers); - if (ret) + } + fs_info->scrub_nocow_workers = + btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + if (!fs_info->scrub_nocow_workers) { + ret = -ENOMEM; goto out; + } } ++fs_info->scrub_workers_refcnt; out: @@ -2816,9 +2823,9 @@ out: static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (--fs_info->scrub_workers_refcnt == 0) { - btrfs_stop_workers(&fs_info->scrub_workers); - 
btrfs_stop_workers(&fs_info->scrub_wr_completion_workers); - btrfs_stop_workers(&fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } @@ -3129,10 +3136,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - nocow_ctx->work.func = copy_nocow_pages_worker; + btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); - btrfs_queue_worker(&fs_info->scrub_nocow_workers, - &nocow_ctx->work); + btrfs_queue_work(fs_info->scrub_nocow_workers, + &nocow_ctx->work); return 0; } @@ -3154,7 +3161,7 @@ static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) #define COPY_COMPLETE 1 -static void copy_nocow_pages_worker(struct btrfs_work *work) +static void copy_nocow_pages_worker(struct btrfs_work_struct *work) { struct scrub_copy_nocow_ctx *nocow_ctx = container_of(work, struct scrub_copy_nocow_ctx, work); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fd07d039b2de..aed1e11060a0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1336,8 +1336,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, - new_pool_size); + btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, + new_pool_size); } static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From a046e9c88b0f46677923864295eac7c92cd962cb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:18 +0800 Subject: btrfs: Cleanup the old btrfs_worker. Since all the btrfs_worker is replaced with the newly created btrfs_workqueue, the old codes can be easily remove. Signed-off-by: Quwenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 707 +----------------------------------------------- fs/btrfs/async-thread.h | 100 ------- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 12 - fs/btrfs/super.c | 8 - 5 files changed, 3 insertions(+), 825 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 977bce2ec887..2a5f383c3636 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -25,714 +25,13 @@ #include #include "async-thread.h" -#define WORK_QUEUED_BIT 0 -#define WORK_DONE_BIT 1 -#define WORK_ORDER_DONE_BIT 2 -#define WORK_HIGH_PRIO_BIT 3 +#define WORK_DONE_BIT 0 +#define WORK_ORDER_DONE_BIT 1 +#define WORK_HIGH_PRIO_BIT 2 #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) -/* - * container for the kthread task pointer and the list of pending work - * One of these is allocated per thread. 
- */ -struct btrfs_worker_thread { - /* pool we belong to */ - struct btrfs_workers *workers; - - /* list of struct btrfs_work that are waiting for service */ - struct list_head pending; - struct list_head prio_pending; - - /* list of worker threads from struct btrfs_workers */ - struct list_head worker_list; - - /* kthread */ - struct task_struct *task; - - /* number of things on the pending list */ - atomic_t num_pending; - - /* reference counter for this struct */ - atomic_t refs; - - unsigned long sequence; - - /* protects the pending list. */ - spinlock_t lock; - - /* set to non-zero when this thread is already awake and kicking */ - int working; - - /* are we currently idle */ - int idle; -}; - -static int __btrfs_start_workers(struct btrfs_workers *workers); - -/* - * btrfs_start_workers uses kthread_run, which can block waiting for memory - * for a very long time. It will actually throttle on page writeback, - * and so it may not make progress until after our btrfs worker threads - * process all of the pending work structs in their queue - * - * This means we can't use btrfs_start_workers from inside a btrfs worker - * thread that is used as part of cleaning dirty memory, which pretty much - * involves all of the worker threads. - * - * Instead we have a helper queue who never has more than one thread - * where we scheduler thread start operations. This worker_start struct - * is used to contain the work and hold a pointer to the queue that needs - * another worker. - */ -struct worker_start { - struct btrfs_work work; - struct btrfs_workers *queue; -}; - -static void start_new_worker_func(struct btrfs_work *work) -{ - struct worker_start *start; - start = container_of(work, struct worker_start, work); - __btrfs_start_workers(start->queue); - kfree(start); -} - -/* - * helper function to move a thread onto the idle list after it - * has finished some requests. - */ -static void check_idle_worker(struct btrfs_worker_thread *worker) -{ - if (!worker->idle && atomic_read(&worker->num_pending) < - worker->workers->idle_thresh / 2) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 1; - - /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move(&worker->worker_list, - &worker->workers->idle_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } -} - -/* - * helper function to move a thread off the idle list after new - * pending work is added. 
- */ -static void check_busy_worker(struct btrfs_worker_thread *worker) -{ - if (worker->idle && atomic_read(&worker->num_pending) >= - worker->workers->idle_thresh) { - unsigned long flags; - spin_lock_irqsave(&worker->workers->lock, flags); - worker->idle = 0; - - if (!list_empty(&worker->worker_list) && - !worker->workers->stopping) { - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - } - spin_unlock_irqrestore(&worker->workers->lock, flags); - } -} - -static void check_pending_worker_creates(struct btrfs_worker_thread *worker) -{ - struct btrfs_workers *workers = worker->workers; - struct worker_start *start; - unsigned long flags; - - rmb(); - if (!workers->atomic_start_pending) - return; - - start = kzalloc(sizeof(*start), GFP_NOFS); - if (!start) - return; - - start->work.func = start_new_worker_func; - start->queue = workers; - - spin_lock_irqsave(&workers->lock, flags); - if (!workers->atomic_start_pending) - goto out; - - workers->atomic_start_pending = 0; - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) - goto out; - - workers->num_workers_starting += 1; - spin_unlock_irqrestore(&workers->lock, flags); - btrfs_queue_worker(workers->atomic_worker_start, &start->work); - return; - -out: - kfree(start); - spin_unlock_irqrestore(&workers->lock, flags); -} - -static noinline void run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) -{ - if (!workers->ordered) - return; - - set_bit(WORK_DONE_BIT, &work->flags); - - spin_lock(&workers->order_lock); - - while (1) { - if (!list_empty(&workers->prio_order_list)) { - work = list_entry(workers->prio_order_list.next, - struct btrfs_work, order_list); - } else if (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - } else { - break; - } - if (!test_bit(WORK_DONE_BIT, &work->flags)) - break; - - /* we are going to call the ordered done function, but - * we leave the work item on the list as a barrier so - * that later work items that are done don't have their - * functions called before this one returns - */ - if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) - break; - - spin_unlock(&workers->order_lock); - - work->ordered_func(work); - - /* now take the lock again and drop our item from the list */ - spin_lock(&workers->order_lock); - list_del(&work->order_list); - spin_unlock(&workers->order_lock); - - /* - * we don't want to call the ordered free functions - * with the lock held though - */ - work->ordered_free(work); - spin_lock(&workers->order_lock); - } - - spin_unlock(&workers->order_lock); -} - -static void put_worker(struct btrfs_worker_thread *worker) -{ - if (atomic_dec_and_test(&worker->refs)) - kfree(worker); -} - -static int try_worker_shutdown(struct btrfs_worker_thread *worker) -{ - int freeit = 0; - - spin_lock_irq(&worker->lock); - spin_lock(&worker->workers->lock); - if (worker->workers->num_workers > 1 && - worker->idle && - !worker->working && - !list_empty(&worker->worker_list) && - list_empty(&worker->prio_pending) && - list_empty(&worker->pending) && - atomic_read(&worker->num_pending) == 0) { - freeit = 1; - list_del_init(&worker->worker_list); - worker->workers->num_workers--; - } - spin_unlock(&worker->workers->lock); - spin_unlock_irq(&worker->lock); - - if (freeit) - put_worker(worker); - return freeit; -} - -static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, - struct list_head *prio_head, - struct list_head *head) -{ - struct 
btrfs_work *work = NULL; - struct list_head *cur = NULL; - - if (!list_empty(prio_head)) { - cur = prio_head->next; - goto out; - } - - smp_mb(); - if (!list_empty(&worker->prio_pending)) - goto refill; - - if (!list_empty(head)) { - cur = head->next; - goto out; - } - -refill: - spin_lock_irq(&worker->lock); - list_splice_tail_init(&worker->prio_pending, prio_head); - list_splice_tail_init(&worker->pending, head); - - if (!list_empty(prio_head)) - cur = prio_head->next; - else if (!list_empty(head)) - cur = head->next; - spin_unlock_irq(&worker->lock); - - if (!cur) - goto out_fail; - -out: - work = list_entry(cur, struct btrfs_work, list); - -out_fail: - return work; -} - -/* - * main loop for servicing work items - */ -static int worker_loop(void *arg) -{ - struct btrfs_worker_thread *worker = arg; - struct list_head head; - struct list_head prio_head; - struct btrfs_work *work; - - INIT_LIST_HEAD(&head); - INIT_LIST_HEAD(&prio_head); - - do { -again: - while (1) { - - - work = get_next_work(worker, &prio_head, &head); - if (!work) - break; - - list_del(&work->list); - clear_bit(WORK_QUEUED_BIT, &work->flags); - - work->worker = worker; - - work->func(work); - - atomic_dec(&worker->num_pending); - /* - * unless this is an ordered work queue, - * 'work' was probably freed by func above. - */ - run_ordered_completions(worker->workers, work); - - check_pending_worker_creates(worker); - cond_resched(); - } - - spin_lock_irq(&worker->lock); - check_idle_worker(worker); - - if (freezing(current)) { - worker->working = 0; - spin_unlock_irq(&worker->lock); - try_to_freeze(); - } else { - spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) { - cpu_relax(); - /* - * we've dropped the lock, did someone else - * jump_in? - */ - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - /* - * this short schedule allows more work to - * come in without the queue functions - * needing to go through wake_up_process() - * - * worker->working is still 1, so nobody - * is going to try and wake us up - */ - schedule_timeout(1); - smp_mb(); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - continue; - - if (kthread_should_stop()) - break; - - /* still no more work?, sleep for real */ - spin_lock_irq(&worker->lock); - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) { - spin_unlock_irq(&worker->lock); - set_current_state(TASK_RUNNING); - goto again; - } - - /* - * this makes sure we get a wakeup when someone - * adds something new to the queue - */ - worker->working = 0; - spin_unlock_irq(&worker->lock); - - if (!kthread_should_stop()) { - schedule_timeout(HZ * 120); - if (!worker->working && - try_worker_shutdown(worker)) { - return 0; - } - } - } - __set_current_state(TASK_RUNNING); - } - } while (!kthread_should_stop()); - return 0; -} - -/* - * this will wait for all the worker threads to shutdown - */ -void btrfs_stop_workers(struct btrfs_workers *workers) -{ - struct list_head *cur; - struct btrfs_worker_thread *worker; - int can_stop; - - spin_lock_irq(&workers->lock); - workers->stopping = 1; - list_splice_init(&workers->idle_list, &workers->worker_list); - while (!list_empty(&workers->worker_list)) { - cur = workers->worker_list.next; - worker = list_entry(cur, struct btrfs_worker_thread, - worker_list); - - atomic_inc(&worker->refs); - workers->num_workers -= 1; - if (!list_empty(&worker->worker_list)) { - list_del_init(&worker->worker_list); - 
put_worker(worker); - can_stop = 1; - } else - can_stop = 0; - spin_unlock_irq(&workers->lock); - if (can_stop) - kthread_stop(worker->task); - spin_lock_irq(&workers->lock); - put_worker(worker); - } - spin_unlock_irq(&workers->lock); -} - -/* - * simple init on struct btrfs_workers - */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_helper) -{ - workers->num_workers = 0; - workers->num_workers_starting = 0; - INIT_LIST_HEAD(&workers->worker_list); - INIT_LIST_HEAD(&workers->idle_list); - INIT_LIST_HEAD(&workers->order_list); - INIT_LIST_HEAD(&workers->prio_order_list); - spin_lock_init(&workers->lock); - spin_lock_init(&workers->order_lock); - workers->max_workers = max; - workers->idle_thresh = 32; - workers->name = name; - workers->ordered = 0; - workers->atomic_start_pending = 0; - workers->atomic_worker_start = async_helper; - workers->stopping = 0; -} - -/* - * starts new worker threads. This does not enforce the max worker - * count in case you need to temporarily go past it. - */ -static int __btrfs_start_workers(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - int ret = 0; - - worker = kzalloc(sizeof(*worker), GFP_NOFS); - if (!worker) { - ret = -ENOMEM; - goto fail; - } - - INIT_LIST_HEAD(&worker->pending); - INIT_LIST_HEAD(&worker->prio_pending); - INIT_LIST_HEAD(&worker->worker_list); - spin_lock_init(&worker->lock); - - atomic_set(&worker->num_pending, 0); - atomic_set(&worker->refs, 1); - worker->workers = workers; - worker->task = kthread_create(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); - if (IS_ERR(worker->task)) { - ret = PTR_ERR(worker->task); - goto fail; - } - - spin_lock_irq(&workers->lock); - if (workers->stopping) { - spin_unlock_irq(&workers->lock); - ret = -EINVAL; - goto fail_kthread; - } - list_add_tail(&worker->worker_list, &workers->idle_list); - worker->idle = 1; - workers->num_workers++; - workers->num_workers_starting--; - WARN_ON(workers->num_workers_starting < 0); - spin_unlock_irq(&workers->lock); - - wake_up_process(worker->task); - return 0; - -fail_kthread: - kthread_stop(worker->task); -fail: - kfree(worker); - spin_lock_irq(&workers->lock); - workers->num_workers_starting--; - spin_unlock_irq(&workers->lock); - return ret; -} - -int btrfs_start_workers(struct btrfs_workers *workers) -{ - spin_lock_irq(&workers->lock); - workers->num_workers_starting++; - spin_unlock_irq(&workers->lock); - return __btrfs_start_workers(workers); -} - -/* - * run through the list and find a worker thread that doesn't have a lot - * to do right now. This can return null if we aren't yet at the thread - * count limit and all of the threads are busy. - */ -static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - struct list_head *next; - int enforce_min; - - enforce_min = (workers->num_workers + workers->num_workers_starting) < - workers->max_workers; - - /* - * if we find an idle thread, don't move it to the end of the - * idle list. This improves the chance that the next submission - * will reuse the same thread, and maybe catch it while it is still - * working - */ - if (!list_empty(&workers->idle_list)) { - next = workers->idle_list.next; - worker = list_entry(next, struct btrfs_worker_thread, - worker_list); - return worker; - } - if (enforce_min || list_empty(&workers->worker_list)) - return NULL; - - /* - * if we pick a busy task, move the task to the end of the list. 
- * hopefully this will keep things somewhat evenly balanced. - * Do the move in batches based on the sequence number. This groups - * requests submitted at roughly the same time onto the same worker. - */ - next = workers->worker_list.next; - worker = list_entry(next, struct btrfs_worker_thread, worker_list); - worker->sequence++; - - if (worker->sequence % workers->idle_thresh == 0) - list_move_tail(next, &workers->worker_list); - return worker; -} - -/* - * selects a worker thread to take the next job. This will either find - * an idle worker, start a new worker up to the max count, or just return - * one of the existing busy workers. - */ -static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) -{ - struct btrfs_worker_thread *worker; - unsigned long flags; - struct list_head *fallback; - int ret; - - spin_lock_irqsave(&workers->lock, flags); -again: - worker = next_worker(workers); - - if (!worker) { - if (workers->num_workers + workers->num_workers_starting >= - workers->max_workers) { - goto fallback; - } else if (workers->atomic_worker_start) { - workers->atomic_start_pending = 1; - goto fallback; - } else { - workers->num_workers_starting++; - spin_unlock_irqrestore(&workers->lock, flags); - /* we're below the limit, start another worker */ - ret = __btrfs_start_workers(workers); - spin_lock_irqsave(&workers->lock, flags); - if (ret) - goto fallback; - goto again; - } - } - goto found; - -fallback: - fallback = NULL; - /* - * we have failed to find any workers, just - * return the first one we can find. - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); -found: - /* - * this makes sure the worker doesn't exit before it is placed - * onto a busy/idle list - */ - atomic_inc(&worker->num_pending); - spin_unlock_irqrestore(&workers->lock, flags); - return worker; -} - -/* - * btrfs_requeue_work just puts the work item back on the tail of the list - * it was taken from. It is intended for use with long running work functions - * that make some progress and want to give the cpu up for others. 
- */ -void btrfs_requeue_work(struct btrfs_work *work) -{ - struct btrfs_worker_thread *worker = work->worker; - unsigned long flags; - int wake = 0; - - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - spin_lock_irqsave(&worker->lock, flags); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); - - /* by definition we're busy, take ourselves off the idle - * list - */ - if (worker->idle) { - spin_lock(&worker->workers->lock); - worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); - spin_unlock(&worker->workers->lock); - } - if (!worker->working) { - wake = 1; - worker->working = 1; - } - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); -} - -void btrfs_set_work_high_prio(struct btrfs_work *work) -{ - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); -} - -/* - * places a struct btrfs_work into the pending queue of one of the kthreads - */ -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) -{ - struct btrfs_worker_thread *worker; - unsigned long flags; - int wake = 0; - - /* don't requeue something already on a list */ - if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) - return; - - worker = find_worker(workers); - if (workers->ordered) { - /* - * you're not allowed to do ordered queues from an - * interrupt handler - */ - spin_lock(&workers->order_lock); - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { - list_add_tail(&work->order_list, - &workers->prio_order_list); - } else { - list_add_tail(&work->order_list, &workers->order_list); - } - spin_unlock(&workers->order_lock); - } else { - INIT_LIST_HEAD(&work->order_list); - } - - spin_lock_irqsave(&worker->lock, flags); - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) - list_add_tail(&work->list, &worker->prio_pending); - else - list_add_tail(&work->list, &worker->pending); - check_busy_worker(worker); - - /* - * avoid calling into wake_up_process if this thread has already - * been kicked - */ - if (!worker->working) - wake = 1; - worker->working = 1; - - if (wake) - wake_up_process(worker->task); - spin_unlock_irqrestore(&worker->lock, flags); -} - struct __btrfs_workqueue_struct { struct workqueue_struct *normal_wq; /* List head pointing to ordered work list */ diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 3129d8a6128b..ab05904f791c 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -20,106 +20,6 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ -struct btrfs_worker_thread; - -/* - * This is similar to a workqueue, but it is meant to spread the operations - * across all available cpus instead of just the CPU that was used to - * queue the work. There is also some batching introduced to try and - * cut down on context switches. - * - * By default threads are added on demand up to 2 * the number of cpus. - * Changing struct btrfs_workers->max_workers is one way to prevent - * demand creation of kthreads. - * - * the basic model of these worker threads is to embed a btrfs_work - * structure in your own data struct, and use container_of in a - * work function to get back to your data struct. 
- */ -struct btrfs_work { - /* - * func should be set to the function you want called - * your work struct is passed as the only arg - * - * ordered_func must be set for work sent to an ordered work queue, - * and it is called to complete a given work item in the same - * order they were sent to the queue. - */ - void (*func)(struct btrfs_work *work); - void (*ordered_func)(struct btrfs_work *work); - void (*ordered_free)(struct btrfs_work *work); - - /* - * flags should be set to zero. It is used to make sure the - * struct is only inserted once into the list. - */ - unsigned long flags; - - /* don't touch these */ - struct btrfs_worker_thread *worker; - struct list_head list; - struct list_head order_list; -}; - -struct btrfs_workers { - /* current number of running workers */ - int num_workers; - - int num_workers_starting; - - /* max number of workers allowed. changed by btrfs_start_workers */ - int max_workers; - - /* once a worker has this many requests or fewer, it is idle */ - int idle_thresh; - - /* force completions in the order they were queued */ - int ordered; - - /* more workers required, but in an interrupt handler */ - int atomic_start_pending; - - /* - * are we allowed to sleep while starting workers or are we required - * to start them at a later time? If we can't sleep, this indicates - * which queue we need to use to schedule thread creation. - */ - struct btrfs_workers *atomic_worker_start; - - /* list with all the work threads. The workers on the idle thread - * may be actively servicing jobs, but they haven't yet hit the - * idle thresh limit above. - */ - struct list_head worker_list; - struct list_head idle_list; - - /* - * when operating in ordered mode, this maintains the list - * of work items waiting for completion - */ - struct list_head order_list; - struct list_head prio_order_list; - - /* lock for finding the next worker thread to queue on */ - spinlock_t lock; - - /* lock for the ordered lists */ - spinlock_t order_lock; - - /* extra name for this worker, used for current->name */ - char *name; - - int stopping; -}; - -void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -int btrfs_start_workers(struct btrfs_workers *workers); -void btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, - struct btrfs_workers *async_starter); -void btrfs_requeue_work(struct btrfs_work *work); -void btrfs_set_work_high_prio(struct btrfs_work *work); - struct btrfs_workqueue_struct; /* Internal use only */ struct __btrfs_workqueue_struct; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a98f86ac187e..5a8c77a441ba 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1504,7 +1504,6 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ - struct btrfs_workers generic_worker; struct btrfs_workqueue_struct *workers; struct btrfs_workqueue_struct *delalloc_workers; struct btrfs_workqueue_struct *flush_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9aaf9c309b54..c80d9507171c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1994,7 +1994,6 @@ static noinline int next_root_backup(struct btrfs_fs_info *info, /* helper to cleanup workers */ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { - btrfs_stop_workers(&fs_info->generic_worker); btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->workers); @@ 
-2472,8 +2471,6 @@ int open_ctree(struct super_block *sb, } max_active = fs_info->thread_pool_size; - btrfs_init_workers(&fs_info->generic_worker, - "genwork", 1, NULL); fs_info->workers = btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, @@ -2526,15 +2523,6 @@ int open_ctree(struct super_block *sb, fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); - /* - * btrfs_start_workers can really only fail because of ENOMEM so just - * return -ENOMEM if any of these fail. - */ - ret = btrfs_start_workers(&fs_info->generic_worker); - if (ret) { - err = -ENOMEM; - goto fail_sb_buffer; - } if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->submit_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index aed1e11060a0..d4878ddba87a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1305,13 +1305,6 @@ error_fs_info: return ERR_PTR(error); } -static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit) -{ - spin_lock_irq(&workers->lock); - workers->max_workers = new_limit; - spin_unlock_irq(&workers->lock); -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, int new_pool_size, int old_pool_size) { @@ -1323,7 +1316,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "resize thread pool %d -> %d", old_pool_size, new_pool_size); - btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size); -- cgit v1.2.3 From d458b0540ebd728b4d6ef47cc5ef0dbfd4dd361a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 28 Feb 2014 10:46:19 +0800 Subject: btrfs: Cleanup the "_struct" suffix in btrfs_workqueue Since the "_struct" suffix was only needed to distinguish the new btrfs_work from the original implementation, and all btrfs_workers have now been converted to btrfs_workqueue, there is no need to keep the suffix. This patch also fixes the code style of lines that had been wrapped awkwardly only because of the overly long "_struct" suffix.
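For orientation, a minimal sketch of how the renamed API reads at a call site (illustrative code only, not part of this patch; the example_* names are made up):

static void example_func(struct btrfs_work *work)
{
	/* the actual work runs here */
}

static int example_setup(struct btrfs_fs_info *fs_info)
{
	/* arguments: name, workqueue flags, max_active, threshold (0 = default) */
	fs_info->workers = btrfs_alloc_workqueue("worker", 0, 8, 0);
	if (!fs_info->workers)
		return -ENOMEM;
	return 0;
}

static void example_submit(struct btrfs_fs_info *fs_info,
			   struct btrfs_work *work)
{
	btrfs_init_work(work, example_func, NULL, NULL);
	btrfs_queue_work(fs_info->workers, work);
}

static void example_teardown(struct btrfs_fs_info *fs_info)
{
	btrfs_destroy_workqueue(fs_info->workers);
}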
Signed-off-by: Qu Wenruo Tested-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 66 ++++++++++++++++++++++++------------------------ fs/btrfs/async-thread.h | 34 ++++++++++++------------- fs/btrfs/ctree.h | 44 ++++++++++++++++---------------- fs/btrfs/delayed-inode.c | 4 +-- fs/btrfs/disk-io.c | 14 +++++----- fs/btrfs/extent-tree.c | 2 +- fs/btrfs/inode.c | 18 ++++++------- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/ordered-data.h | 4 +-- fs/btrfs/qgroup.c | 2 +- fs/btrfs/raid56.c | 14 +++++----- fs/btrfs/reada.c | 5 ++-- fs/btrfs/scrub.c | 23 ++++++++--------- fs/btrfs/volumes.c | 2 +- fs/btrfs/volumes.h | 2 +- 15 files changed, 116 insertions(+), 120 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 2a5f383c3636..a709585e2c97 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -32,7 +32,7 @@ #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) -struct __btrfs_workqueue_struct { +struct __btrfs_workqueue { struct workqueue_struct *normal_wq; /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -49,15 +49,15 @@ struct __btrfs_workqueue_struct { spinlock_t thres_lock; }; -struct btrfs_workqueue_struct { - struct __btrfs_workqueue_struct *normal; - struct __btrfs_workqueue_struct *high; +struct btrfs_workqueue { + struct __btrfs_workqueue *normal; + struct __btrfs_workqueue *high; }; -static inline struct __btrfs_workqueue_struct +static inline struct __btrfs_workqueue *__btrfs_alloc_workqueue(char *name, int flags, int max_active, int thresh) { - struct __btrfs_workqueue_struct *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); if (unlikely(!ret)) return NULL; @@ -95,14 +95,14 @@ static inline struct __btrfs_workqueue_struct } static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue_struct *wq); +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, - int flags, - int max_active, - int thresh) +struct btrfs_workqueue *btrfs_alloc_workqueue(char *name, + int flags, + int max_active, + int thresh) { - struct btrfs_workqueue_struct *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); if (unlikely(!ret)) return NULL; @@ -131,7 +131,7 @@ struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, * This hook WILL be called in IRQ handler context, * so workqueue_set_max_active MUST NOT be called in this hook */ -static inline void thresh_queue_hook(struct __btrfs_workqueue_struct *wq) +static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) { if (wq->thresh == NO_THRESHOLD) return; @@ -143,7 +143,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue_struct *wq) * This hook is called in kthread content. * So workqueue_set_max_active is called here. 
*/ -static inline void thresh_exec_hook(struct __btrfs_workqueue_struct *wq) +static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) { int new_max_active; long pending; @@ -186,10 +186,10 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue_struct *wq) +static void run_ordered_work(struct __btrfs_workqueue *wq) { struct list_head *list = &wq->ordered_list; - struct btrfs_work_struct *work; + struct btrfs_work *work; spinlock_t *lock = &wq->list_lock; unsigned long flags; @@ -197,7 +197,7 @@ static void run_ordered_work(struct __btrfs_workqueue_struct *wq) spin_lock_irqsave(lock, flags); if (list_empty(list)) break; - work = list_entry(list->next, struct btrfs_work_struct, + work = list_entry(list->next, struct btrfs_work, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -229,11 +229,11 @@ static void run_ordered_work(struct __btrfs_workqueue_struct *wq) static void normal_work_helper(struct work_struct *arg) { - struct btrfs_work_struct *work; - struct __btrfs_workqueue_struct *wq; + struct btrfs_work *work; + struct __btrfs_workqueue *wq; int need_order = 0; - work = container_of(arg, struct btrfs_work_struct, normal_work); + work = container_of(arg, struct btrfs_work, normal_work); /* * We should not touch things inside work in the following cases: * 1) after work->func() if it has no ordered_free @@ -254,10 +254,10 @@ static void normal_work_helper(struct work_struct *arg) } } -void btrfs_init_work(struct btrfs_work_struct *work, - void (*func)(struct btrfs_work_struct *), - void (*ordered_func)(struct btrfs_work_struct *), - void (*ordered_free)(struct btrfs_work_struct *)) +void btrfs_init_work(struct btrfs_work *work, + void (*func)(struct btrfs_work *), + void (*ordered_func)(struct btrfs_work *), + void (*ordered_free)(struct btrfs_work *)) { work->func = func; work->ordered_func = ordered_func; @@ -267,8 +267,8 @@ void btrfs_init_work(struct btrfs_work_struct *work, work->flags = 0; } -static inline void __btrfs_queue_work(struct __btrfs_workqueue_struct *wq, - struct btrfs_work_struct *work) +static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, + struct btrfs_work *work) { unsigned long flags; @@ -282,10 +282,10 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue_struct *wq, queue_work(wq->normal_wq, &work->normal_work); } -void btrfs_queue_work(struct btrfs_workqueue_struct *wq, - struct btrfs_work_struct *work) +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work) { - struct __btrfs_workqueue_struct *dest_wq; + struct __btrfs_workqueue *dest_wq; if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) dest_wq = wq->high; @@ -295,13 +295,13 @@ void btrfs_queue_work(struct btrfs_workqueue_struct *wq, } static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue_struct *wq) +__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) { destroy_workqueue(wq->normal_wq); kfree(wq); } -void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq) +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { if (!wq) return; @@ -310,14 +310,14 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq) __btrfs_destroy_workqueue(wq->normal); } -void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max) +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) { wq->normal->max_active = max; if (wq->high) wq->high->max_active = max; } -void btrfs_set_work_high_priority(struct btrfs_work_struct *work) +void btrfs_set_work_high_priority(struct 
btrfs_work *work) { set_bit(WORK_HIGH_PRIO_BIT, &work->flags); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ab05904f791c..08d717476227 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -20,33 +20,33 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ -struct btrfs_workqueue_struct; +struct btrfs_workqueue; /* Internal use only */ -struct __btrfs_workqueue_struct; +struct __btrfs_workqueue; -struct btrfs_work_struct { - void (*func)(struct btrfs_work_struct *arg); - void (*ordered_func)(struct btrfs_work_struct *arg); - void (*ordered_free)(struct btrfs_work_struct *arg); +struct btrfs_work { + void (*func)(struct btrfs_work *arg); + void (*ordered_func)(struct btrfs_work *arg); + void (*ordered_free)(struct btrfs_work *arg); /* Don't touch things below */ struct work_struct normal_work; struct list_head ordered_list; - struct __btrfs_workqueue_struct *wq; + struct __btrfs_workqueue *wq; unsigned long flags; }; -struct btrfs_workqueue_struct *btrfs_alloc_workqueue(char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(char *name, int flags, int max_active, int thresh); -void btrfs_init_work(struct btrfs_work_struct *work, - void (*func)(struct btrfs_work_struct *), - void (*ordered_func)(struct btrfs_work_struct *), - void (*ordered_free)(struct btrfs_work_struct *)); -void btrfs_queue_work(struct btrfs_workqueue_struct *wq, - struct btrfs_work_struct *work); -void btrfs_destroy_workqueue(struct btrfs_workqueue_struct *wq); -void btrfs_workqueue_set_max(struct btrfs_workqueue_struct *wq, int max); -void btrfs_set_work_high_priority(struct btrfs_work_struct *work); +void btrfs_init_work(struct btrfs_work *work, + void (*func)(struct btrfs_work *), + void (*ordered_func)(struct btrfs_work *), + void (*ordered_free)(struct btrfs_work *)); +void btrfs_queue_work(struct btrfs_workqueue *wq, + struct btrfs_work *work); +void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); +void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); +void btrfs_set_work_high_priority(struct btrfs_work *work); #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5a8c77a441ba..b4d2e957b89f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1221,7 +1221,7 @@ struct btrfs_caching_control { struct list_head list; struct mutex mutex; wait_queue_head_t wait; - struct btrfs_work_struct work; + struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; atomic_t count; @@ -1504,27 +1504,27 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ - struct btrfs_workqueue_struct *workers; - struct btrfs_workqueue_struct *delalloc_workers; - struct btrfs_workqueue_struct *flush_workers; - struct btrfs_workqueue_struct *endio_workers; - struct btrfs_workqueue_struct *endio_meta_workers; - struct btrfs_workqueue_struct *endio_raid56_workers; - struct btrfs_workqueue_struct *rmw_workers; - struct btrfs_workqueue_struct *endio_meta_write_workers; - struct btrfs_workqueue_struct *endio_write_workers; - struct btrfs_workqueue_struct *endio_freespace_worker; - struct btrfs_workqueue_struct *submit_workers; - struct btrfs_workqueue_struct *caching_workers; - struct btrfs_workqueue_struct *readahead_workers; + struct btrfs_workqueue *workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct btrfs_workqueue *endio_workers; + struct btrfs_workqueue *endio_meta_workers; + struct btrfs_workqueue *endio_raid56_workers; + struct 
btrfs_workqueue *rmw_workers; + struct btrfs_workqueue *endio_meta_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *submit_workers; + struct btrfs_workqueue *caching_workers; + struct btrfs_workqueue *readahead_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens * for the sys_munmap function call path */ - struct btrfs_workqueue_struct *fixup_workers; - struct btrfs_workqueue_struct *delayed_workers; + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; int thread_pool_size; @@ -1604,9 +1604,9 @@ struct btrfs_fs_info { atomic_t scrub_cancel_req; wait_queue_head_t scrub_pause_wait; int scrub_workers_refcnt; - struct btrfs_workqueue_struct *scrub_workers; - struct btrfs_workqueue_struct *scrub_wr_completion_workers; - struct btrfs_workqueue_struct *scrub_nocow_workers; + struct btrfs_workqueue *scrub_workers; + struct btrfs_workqueue *scrub_wr_completion_workers; + struct btrfs_workqueue *scrub_nocow_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1647,9 +1647,9 @@ struct btrfs_fs_info { /* qgroup rescan items */ struct mutex qgroup_rescan_lock; /* protects the progress item */ struct btrfs_key qgroup_rescan_progress; - struct btrfs_workqueue_struct *qgroup_rescan_workers; + struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; - struct btrfs_work_struct qgroup_rescan_work; + struct btrfs_work qgroup_rescan_work; /* filesystem state */ unsigned long fs_state; @@ -3680,7 +3680,7 @@ struct btrfs_delalloc_work { int delay_iput; struct completion completion; struct list_head list; - struct btrfs_work_struct work; + struct btrfs_work work; }; struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 76e85d66801f..33e561a84013 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1318,10 +1318,10 @@ void btrfs_remove_delayed_node(struct inode *inode) struct btrfs_async_delayed_work { struct btrfs_delayed_root *delayed_root; int nr; - struct btrfs_work_struct work; + struct btrfs_work work; }; -static void btrfs_async_run_delayed_root(struct btrfs_work_struct *work) +static void btrfs_async_run_delayed_root(struct btrfs_work *work) { struct btrfs_async_delayed_work *async_work; struct btrfs_delayed_root *delayed_root; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c80d9507171c..f7d84d955764 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,7 +55,7 @@ #endif static struct extent_io_ops btree_extent_io_ops; -static void end_workqueue_fn(struct btrfs_work_struct *work); +static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); @@ -86,7 +86,7 @@ struct end_io_wq { int error; int metadata; struct list_head list; - struct btrfs_work_struct work; + struct btrfs_work work; }; /* @@ -108,7 +108,7 @@ struct async_submit_bio { * can't tell us where in the file the bio should go */ u64 bio_offset; - struct btrfs_work_struct work; + struct btrfs_work work; int error; }; @@ -742,7 +742,7 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) return 256 * limit; } -static void 
run_one_async_start(struct btrfs_work_struct *work) +static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; int ret; @@ -755,7 +755,7 @@ static void run_one_async_start(struct btrfs_work_struct *work) async->error = ret; } -static void run_one_async_done(struct btrfs_work_struct *work) +static void run_one_async_done(struct btrfs_work *work) { struct btrfs_fs_info *fs_info; struct async_submit_bio *async; @@ -782,7 +782,7 @@ static void run_one_async_done(struct btrfs_work_struct *work) async->bio_offset); } -static void run_one_async_free(struct btrfs_work_struct *work) +static void run_one_async_free(struct btrfs_work *work) { struct async_submit_bio *async; @@ -1668,7 +1668,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens */ -static void end_workqueue_fn(struct btrfs_work_struct *work) +static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; struct end_io_wq *end_io_wq; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bb58082f6d61..19ea8ad70c67 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -378,7 +378,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work_struct *work) +static noinline void caching_thread(struct btrfs_work *work) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0885f333574d..53697a80b849 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -324,7 +324,7 @@ struct async_cow { u64 start; u64 end; struct list_head extents; - struct btrfs_work_struct work; + struct btrfs_work work; }; static noinline int add_async_extent(struct async_cow *cow, @@ -1000,7 +1000,7 @@ out_unlock: /* * work queue call back to started compression on a file and pages */ -static noinline void async_cow_start(struct btrfs_work_struct *work) +static noinline void async_cow_start(struct btrfs_work *work) { struct async_cow *async_cow; int num_added = 0; @@ -1018,7 +1018,7 @@ static noinline void async_cow_start(struct btrfs_work_struct *work) /* * work queue call back to submit previously compressed pages */ -static noinline void async_cow_submit(struct btrfs_work_struct *work) +static noinline void async_cow_submit(struct btrfs_work *work) { struct async_cow *async_cow; struct btrfs_root *root; @@ -1039,7 +1039,7 @@ static noinline void async_cow_submit(struct btrfs_work_struct *work) submit_compressed_extents(async_cow->inode, async_cow); } -static noinline void async_cow_free(struct btrfs_work_struct *work) +static noinline void async_cow_free(struct btrfs_work *work) { struct async_cow *async_cow; async_cow = container_of(work, struct async_cow, work); @@ -1748,10 +1748,10 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct btrfs_work_struct work; + struct btrfs_work work; }; -static void btrfs_writepage_fixup_worker(struct btrfs_work_struct *work) +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup; struct btrfs_ordered_extent *ordered; @@ -2750,7 +2750,7 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work_struct *work) 
+static void finish_ordered_fn(struct btrfs_work *work) { struct btrfs_ordered_extent *ordered_extent; ordered_extent = container_of(work, struct btrfs_ordered_extent, work); @@ -2763,7 +2763,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue_struct *workers; + struct btrfs_workqueue *workers; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -8384,7 +8384,7 @@ out_notrans: return ret; } -static void btrfs_run_delalloc_work(struct btrfs_work_struct *work) +static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; struct inode *inode; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 6fa8219b5d03..751ee38083a9 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -576,7 +576,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, wake_up(&entry->wait); } -static void btrfs_run_ordered_extent_work(struct btrfs_work_struct *work) +static void btrfs_run_ordered_extent_work(struct btrfs_work *work) { struct btrfs_ordered_extent *ordered; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 84bb236119fe..246897058efb 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -130,10 +130,10 @@ struct btrfs_ordered_extent { /* a per root list of all the pending ordered extents */ struct list_head root_extent_list; - struct btrfs_work_struct work; + struct btrfs_work work; struct completion completion; - struct btrfs_work_struct flush_work; + struct btrfs_work flush_work; struct list_head work_list; }; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 38617cc2fdd5..2cf905877aaf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1984,7 +1984,7 @@ out: return ret; } -static void btrfs_qgroup_rescan_worker(struct btrfs_work_struct *work) +static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) { struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, qgroup_rescan_work); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5afa564201a2..1269fc30b15c 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -87,7 +87,7 @@ struct btrfs_raid_bio { /* * for scheduling work in the helper threads */ - struct btrfs_work_struct work; + struct btrfs_work work; /* * bio list and bio_list_lock are used @@ -166,8 +166,8 @@ struct btrfs_raid_bio { static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work_struct *work); -static void read_rebuild_work(struct btrfs_work_struct *work); +static void rmw_work(struct btrfs_work *work); +static void read_rebuild_work(struct btrfs_work *work); static void async_rmw_stripe(struct btrfs_raid_bio *rbio); static void async_read_rebuild(struct btrfs_raid_bio *rbio); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); @@ -1588,7 +1588,7 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work_struct work; + struct btrfs_work work; }; /* @@ -1652,7 +1652,7 @@ static void run_plug(struct btrfs_plug_cb *plug) * if the unplug comes from schedule, we have to push the * work off to a helper thread */ -static void unplug_work(struct btrfs_work_struct *work) +static void unplug_work(struct btrfs_work *work) { struct 
btrfs_plug_cb *plug; plug = container_of(work, struct btrfs_plug_cb, work); @@ -2079,7 +2079,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, } -static void rmw_work(struct btrfs_work_struct *work) +static void rmw_work(struct btrfs_work *work) { struct btrfs_raid_bio *rbio; @@ -2087,7 +2087,7 @@ static void rmw_work(struct btrfs_work_struct *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct btrfs_work_struct *work) +static void read_rebuild_work(struct btrfs_work *work) { struct btrfs_raid_bio *rbio; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9e01d3677355..30947f923620 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -91,8 +91,7 @@ struct reada_zone { }; struct reada_machine_work { - struct btrfs_work_struct - work; + struct btrfs_work work; struct btrfs_fs_info *fs_info; }; @@ -734,7 +733,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } -static void reada_start_machine_worker(struct btrfs_work_struct *work) +static void reada_start_machine_worker(struct btrfs_work *work) { struct reada_machine_work *rmw; struct btrfs_fs_info *fs_info; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5a240f5e6ceb..db21a1360e13 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -96,8 +96,7 @@ struct scrub_bio { #endif int page_count; int next_free; - struct btrfs_work_struct - work; + struct btrfs_work work; }; struct scrub_block { @@ -155,8 +154,7 @@ struct scrub_fixup_nodatasum { struct btrfs_device *dev; u64 logical; struct btrfs_root *root; - struct btrfs_work_struct - work; + struct btrfs_work work; int mirror_num; }; @@ -174,8 +172,7 @@ struct scrub_copy_nocow_ctx { int mirror_num; u64 physical_for_dev_replace; struct list_head inodes; - struct btrfs_work_struct - work; + struct btrfs_work work; }; struct scrub_warning { @@ -234,7 +231,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, u64 gen, int mirror_num, u8 *csum, int force, u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio, int err); -static void scrub_bio_end_io_worker(struct btrfs_work_struct *work); +static void scrub_bio_end_io_worker(struct btrfs_work *work); static void scrub_block_complete(struct scrub_block *sblock); static void scrub_remap_extent(struct btrfs_fs_info *fs_info, u64 extent_logical, u64 extent_len, @@ -251,14 +248,14 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio, int err); -static void scrub_wr_bio_end_io_worker(struct btrfs_work_struct *work); +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); static int write_page_nocow(struct scrub_ctx *sctx, u64 physical_for_dev_replace, struct page *page); static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *ctx); static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace); -static void copy_nocow_pages_worker(struct btrfs_work_struct *work); +static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); @@ -737,7 +734,7 @@ out: return -EIO; } -static void scrub_fixup_nodatasum(struct btrfs_work_struct *work) +static void scrub_fixup_nodatasum(struct btrfs_work *work) { int ret; struct scrub_fixup_nodatasum *fixup; @@ -1622,7 +1619,7 @@ 
static void scrub_wr_bio_end_io(struct bio *bio, int err) btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } -static void scrub_wr_bio_end_io_worker(struct btrfs_work_struct *work) +static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; @@ -2090,7 +2087,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) btrfs_queue_work(fs_info->scrub_workers, &sbio->work); } -static void scrub_bio_end_io_worker(struct btrfs_work_struct *work) +static void scrub_bio_end_io_worker(struct btrfs_work *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; @@ -3161,7 +3158,7 @@ static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx) #define COPY_COMPLETE 1 -static void copy_nocow_pages_worker(struct btrfs_work_struct *work) +static void copy_nocow_pages_worker(struct btrfs_work *work) { struct scrub_copy_nocow_ctx *nocow_ctx = container_of(work, struct scrub_copy_nocow_ctx, work); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0066cff077ce..b4660c413c73 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -440,7 +440,7 @@ done: blk_finish_plug(&plug); } -static void pending_bios_fn(struct btrfs_work_struct *work) +static void pending_bios_fn(struct btrfs_work *work) { struct btrfs_device *device; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5d9a03773ca6..80754f9dd3df 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -95,7 +95,7 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_device; - struct btrfs_work_struct work; + struct btrfs_work work; struct rcu_head rcu; struct work_struct rcu_work; -- cgit v1.2.3 From dec8ef90552f7b8cc6612daa2e3aa3a2212b0402 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 1 Mar 2014 10:55:54 +0000 Subject: Btrfs: correctly flush data on defrag when compression is enabled When the defrag flag BTRFS_DEFRAG_RANGE_START_IO is set and compression enabled, we weren't flushing completely, as writing compressed extents is a 2 steps process, one to compress the data and another one to write the compressed data to disk. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d4c179502775..f914b5db7ff1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1382,8 +1382,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } } - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) + if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { /* the filemap_flush will queue IO into the worker threads, but -- cgit v1.2.3 From e2127cf008be01c6aa9db463470c0668a85d6fe4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 1 Mar 2014 10:57:03 +0000 Subject: Btrfs: make defrag not fragment files when using prealloc extents When using prealloc extents, a file defragment operation may actually fragment the file and increase the amount of data space used by the file. This change fixes that behaviour. 
Example: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt $ cd /mnt $ xfs_io -f -c 'falloc 0 1048576' foobar && sync $ xfs_io -c 'pwrite -S 0xff -b 100000 5000 100000' foobar $ xfs_io -c 'pwrite -S 0xac -b 100000 200000 100000' foobar $ xfs_io -c 'pwrite -S 0xe1 -b 100000 900000 100000' foobar && sync Before defragmenting the file: $ btrfs filesystem df /mnt Data, single: total=8.00MiB, used=1.25MiB System, DUP: total=8.00MiB, used=16.00KiB System, single: total=4.00MiB, used=0.00 Metadata, DUP: total=1.00GiB, used=112.00KiB Metadata, single: total=8.00MiB, used=0.00 $ btrfs-debug-tree /dev/sdb3 (...) item 6 key (257 EXTENT_DATA 0) itemoff 15810 itemsize 53 prealloc data disk byte 12845056 nr 1048576 prealloc data offset 0 nr 4096 item 7 key (257 EXTENT_DATA 4096) itemoff 15757 itemsize 53 extent data disk byte 12845056 nr 1048576 extent data offset 4096 nr 102400 ram 1048576 extent compression 0 item 8 key (257 EXTENT_DATA 106496) itemoff 15704 itemsize 53 prealloc data disk byte 12845056 nr 1048576 prealloc data offset 106496 nr 90112 item 9 key (257 EXTENT_DATA 196608) itemoff 15651 itemsize 53 extent data disk byte 12845056 nr 1048576 extent data offset 196608 nr 106496 ram 1048576 extent compression 0 item 10 key (257 EXTENT_DATA 303104) itemoff 15598 itemsize 53 prealloc data disk byte 12845056 nr 1048576 prealloc data offset 303104 nr 593920 item 11 key (257 EXTENT_DATA 897024) itemoff 15545 itemsize 53 extent data disk byte 12845056 nr 1048576 extent data offset 897024 nr 106496 ram 1048576 extent compression 0 item 12 key (257 EXTENT_DATA 1003520) itemoff 15492 itemsize 53 prealloc data disk byte 12845056 nr 1048576 prealloc data offset 1003520 nr 45056 (...) Now defragmenting the file results in more data space used than before: $ btrfs filesystem defragment -f foobar && sync $ btrfs filesystem df /mnt Data, single: total=8.00MiB, used=1.55MiB System, DUP: total=8.00MiB, used=16.00KiB System, single: total=4.00MiB, used=0.00 Metadata, DUP: total=1.00GiB, used=112.00KiB Metadata, single: total=8.00MiB, used=0.00 And the corresponding file extent items are now no longer perfectly sequential as before, and we're now needlessly using more space from data block groups: $ btrfs-debug-tree /dev/sdb3 (...) item 6 key (257 EXTENT_DATA 0) itemoff 15810 itemsize 53 extent data disk byte 12845056 nr 1048576 extent data offset 0 nr 4096 ram 1048576 extent compression 0 item 7 key (257 EXTENT_DATA 4096) itemoff 15757 itemsize 53 extent data disk byte 13893632 nr 102400 extent data offset 0 nr 102400 ram 102400 extent compression 0 item 8 key (257 EXTENT_DATA 106496) itemoff 15704 itemsize 53 extent data disk byte 12845056 nr 1048576 extent data offset 106496 nr 90112 ram 1048576 extent compression 0 item 9 key (257 EXTENT_DATA 196608) itemoff 15651 itemsize 53 extent data disk byte 13996032 nr 106496 extent data offset 0 nr 106496 ram 106496 extent compression 0 item 10 key (257 EXTENT_DATA 303104) itemoff 15598 itemsize 53 prealloc data disk byte 12845056 nr 1048576 prealloc data offset 303104 nr 593920 item 11 key (257 EXTENT_DATA 897024) itemoff 15545 itemsize 53 extent data disk byte 14102528 nr 106496 extent data offset 0 nr 106496 ram 106496 extent compression 0 item 12 key (257 EXTENT_DATA 1003520) itemoff 15492 itemsize 53 extent data disk byte 12845056 nr 1048576 extent data offset 1003520 nr 45056 ram 1048576 extent compression 0 (...) 
With this change, the above example will no longer cause allocation of new data space nor change the sequentiality of the file extents, that is, defragment will be effectless, leaving all extent items pointing to the extent starting at disk byte 12845056. In a 20Gb filesystem I had, mounted with the autodefrag option and 20 files of 400Mb each, initially consisting of a single prealloc extent of 400Mb, having random writes happening at a low rate, lead to a total of over ~17Gb of data space used, not far from eventually reaching an ENOSPC state. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f914b5db7ff1..1ae45bd9d27d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -983,7 +983,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || + (em->block_start + em->block_len == next->block_start)) ret = false; free_extent_map(next); -- cgit v1.2.3 From fcbd2154d16431395e86a48859a5b547c33c09ad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Mar 2014 12:28:40 +0000 Subject: Btrfs: avoid unnecessary utimes update in incremental send When we're finishing processing of an inode, if we're dealing with a directory inode that has a pending move/rename operation, we don't need to send a utimes update instruction to the send stream, as we'll do it later after doing the move/rename operation. Therefore we save some time here building paths and doing btree lookups. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c2522e4e2c59..9d057ef5adef 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4957,18 +4957,19 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = apply_children_dir_moves(sctx); if (ret) goto out; + /* + * Need to send that every time, no matter if it actually + * changed between the two trees as we have done changes to + * the inode before. If our inode is a directory and it's + * waiting to be moved/renamed, we will send its utimes when + * it's moved/renamed, therefore we don't need to do it here. + */ + sctx->send_progress = sctx->cur_ino + 1; + ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); + if (ret < 0) + goto out; } - /* - * Need to send that every time, no matter if it actually - * changed between the two trees as we have done changes to - * the inode before. - */ - sctx->send_progress = sctx->cur_ino + 1; - ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen); - if (ret < 0) - goto out; - out: return ret; } -- cgit v1.2.3 From a4d96d6254590df5eb9a6ac32434ed9d33a46d19 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 3 Mar 2014 21:31:03 +0800 Subject: Btrfs: share the same code for __record_{new,deleted}_ref This has no functional change, only picks out the same part of two functions, and makes it shared. 
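Outside of diff form the net result is easier to see; a condensed sketch (the shared record_ref() helper, shown in full in the hunk below, also looks up the directory generation and builds the full path):

static int __record_new_ref(int num, u64 dir, int index,
			    struct fs_path *name, void *ctx)
{
	struct send_ctx *sctx = ctx;

	/* new refs are resolved against the send root */
	return record_ref(sctx->send_root, num, dir, index, name,
			  ctx, &sctx->new_refs);
}

static int __record_deleted_ref(int num, u64 dir, int index,
				struct fs_path *name, void *ctx)
{
	struct send_ctx *sctx = ctx;

	/* deleted refs are resolved against the parent root */
	return record_ref(sctx->parent_root, num, dir, index, name,
			  ctx, &sctx->deleted_refs);
}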
Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 49 +++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9d057ef5adef..112eb647b5cd 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2615,7 +2615,7 @@ struct recorded_ref { * everything mixed. So we first record all refs and later process them. * This function is a helper to record one ref. */ -static int record_ref(struct list_head *head, u64 dir, +static int __record_ref(struct list_head *head, u64 dir, u64 dir_gen, struct fs_path *path) { struct recorded_ref *ref; @@ -3555,9 +3555,8 @@ out: return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, + struct fs_path *name, void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -3568,7 +3567,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (!p) return -ENOMEM; - ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL, + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -3580,7 +3579,7 @@ static int __record_new_ref(int num, u64 dir, int index, if (ret < 0) goto out; - ret = record_ref(&sctx->new_refs, dir, gen, p); + ret = __record_ref(refs, dir, gen, p); out: if (ret) @@ -3588,37 +3587,23 @@ out: return ret; } +static int __record_new_ref(int num, u64 dir, int index, + struct fs_path *name, + void *ctx) +{ + struct send_ctx *sctx = ctx; + return record_ref(sctx->send_root, num, dir, index, name, + ctx, &sctx->new_refs); +} + + static int __record_deleted_ref(int num, u64 dir, int index, struct fs_path *name, void *ctx) { - int ret = 0; struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, dir, gen, p); - if (ret < 0) - goto out; - ret = fs_path_add_path(p, name); - if (ret < 0) - goto out; - - ret = record_ref(&sctx->deleted_refs, dir, gen, p); - -out: - if (ret) - fs_path_free(p); - return ret; + return record_ref(sctx->parent_root, num, dir, index, name, + ctx, &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) -- cgit v1.2.3 From 2131bcd38b18167f499f190acf3409dfe5b3c280 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 5 Mar 2014 10:07:35 +0800 Subject: Btrfs: add readahead for send_write Btrfs send reads data from disk and then writes to a stream via pipe or a file via flush. Currently we're going to read each page a time, so every page results in a disk read, which is not friendly to disks, esp. HDD. Given that, the performance can be gained by adding readahead for those pages. 
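In sketch form, the change amounts to priming the page cache once for the whole requested range before the existing page-by-page loop in fill_read_buf() (a condensed view of the hunk below):

	/* one synchronous readahead pass covers [index, last_index], so the
	 * per-page loop that follows mostly finds its pages already cached */
	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
	file_ra_state_init(&sctx->ra, inode->i_mapping);
	btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
		       last_index - index + 1);

	while (index <= last_index) {
		/* read each page and copy it into sctx->read_buf, unchanged */
		index++;
	}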
Here is a quick test: $ btrfs subvolume create send $ xfs_io -f -c "pwrite 0 1G" send/foobar $ btrfs subvolume snap -r send ro $ time "btrfs send ro -f /dev/null" w/o w real 1m37.527s 0m9.097s user 0m0.122s 0m0.086s sys 0m53.191s 0m12.857s Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/send.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 112eb647b5cd..646369179697 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -124,6 +124,8 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; + struct file_ra_state ra; + char *read_buf; /* @@ -4170,6 +4172,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) goto out; last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT; + + /* initial readahead */ + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, inode->i_mapping); + btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + last_index - index + 1); + while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_CACHE_SIZE - pg_offset); -- cgit v1.2.3 From 6db8914f9763d3f0a7610b497d44f93a4c17e62e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 6 Mar 2014 04:19:50 +0000 Subject: btrfs: Cleanup the btrfs_workqueue related function type The new btrfs_workqueue still use open-coded function defition, this patch will change them into btrfs_func_t type which is much the same as kernel workqueue. Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 6 +++--- fs/btrfs/async-thread.h | 20 +++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a709585e2c97..d8c07e5c1f24 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -255,9 +255,9 @@ static void normal_work_helper(struct work_struct *arg) } void btrfs_init_work(struct btrfs_work *work, - void (*func)(struct btrfs_work *), - void (*ordered_func)(struct btrfs_work *), - void (*ordered_free)(struct btrfs_work *)) + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free) { work->func = func; work->ordered_func = ordered_func; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 08d717476227..0a891cdc4c28 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -23,11 +23,13 @@ struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; +struct btrfs_work; +typedef void (*btrfs_func_t)(struct btrfs_work *arg); struct btrfs_work { - void (*func)(struct btrfs_work *arg); - void (*ordered_func)(struct btrfs_work *arg); - void (*ordered_free)(struct btrfs_work *arg); + btrfs_func_t func; + btrfs_func_t ordered_func; + btrfs_func_t ordered_free; /* Don't touch things below */ struct work_struct normal_work; @@ -37,13 +39,13 @@ struct btrfs_work { }; struct btrfs_workqueue *btrfs_alloc_workqueue(char *name, - int flags, - int max_active, - int thresh); + int flags, + int max_active, + int thresh); void btrfs_init_work(struct btrfs_work *work, - void (*func)(struct btrfs_work *), - void (*ordered_func)(struct btrfs_work *), - void (*ordered_free)(struct btrfs_work *)); + btrfs_func_t func, + btrfs_func_t ordered_func, + btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); -- cgit v1.2.3 From 52483bc26f0e95c91e8fd07f9def588bf89664f8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo 
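As a usage illustration (hypothetical code, not part of this patch), the three callbacks simply have to match the shared btrfs_func_t signature; call sites keep the same shape:

struct demo_item {
	struct btrfs_work work;
	/* payload */
};

static void demo_run(struct btrfs_work *work)
{
	/* main work, runs on the normal workqueue */
}

static void demo_done(struct btrfs_work *work)
{
	/* ordered completion, runs in submission order */
}

static void demo_free(struct btrfs_work *work)
{
	kfree(container_of(work, struct demo_item, work));
}

static void demo_submit(struct btrfs_workqueue *wq, struct demo_item *item)
{
	btrfs_init_work(&item->work, demo_run, demo_done, demo_free);
	btrfs_queue_work(wq, &item->work);
}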
Date: Thu, 6 Mar 2014 04:19:50 +0000 Subject: btrfs: Add ftrace for btrfs_workqueue Add ftrace for btrfs_workqueue for further workqueue tunning. This patch needs to applied after the workqueue replace patchset. Signed-off-by: Qu Wenruo Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 7 ++++ include/trace/events/btrfs.h | 82 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index d8c07e5c1f24..00623dd16b81 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -24,6 +24,7 @@ #include #include #include "async-thread.h" +#include "ctree.h" #define WORK_DONE_BIT 0 #define WORK_ORDER_DONE_BIT 1 @@ -210,6 +211,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) */ if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; + trace_btrfs_ordered_sched(work); spin_unlock_irqrestore(lock, flags); work->ordered_func(work); @@ -223,6 +225,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) * with the lock held though */ work->ordered_free(work); + trace_btrfs_all_work_done(work); } spin_unlock_irqrestore(lock, flags); } @@ -246,12 +249,15 @@ static void normal_work_helper(struct work_struct *arg) need_order = 1; wq = work->wq; + trace_btrfs_work_sched(work); thresh_exec_hook(wq); work->func(work); if (need_order) { set_bit(WORK_DONE_BIT, &work->flags); run_ordered_work(wq); } + if (!need_order) + trace_btrfs_all_work_done(work); } void btrfs_init_work(struct btrfs_work *work, @@ -280,6 +286,7 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, spin_unlock_irqrestore(&wq->list_lock, flags); } queue_work(wq->normal_wq, &work->normal_work); + trace_btrfs_work_queued(work); } void btrfs_queue_work(struct btrfs_workqueue *wq, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 3176cdc32937..c346919254a9 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -21,6 +21,7 @@ struct btrfs_block_group_cache; struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; +struct btrfs_work; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -982,6 +983,87 @@ TRACE_EVENT(free_extent_state, (void *)__entry->ip) ); +DECLARE_EVENT_CLASS(btrfs__work, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, wq ) + __field( void *, func ) + __field( void *, ordered_func ) + __field( void *, ordered_free ) + ), + + TP_fast_assign( + __entry->work = work; + __entry->wq = work->wq; + __entry->func = work->func; + __entry->ordered_func = work->ordered_func; + __entry->ordered_free = work->ordered_free; + ), + + TP_printk("work=%p, wq=%p, func=%p, ordered_func=%p, ordered_free=%p", + __entry->work, __entry->wq, __entry->func, + __entry->ordered_func, __entry->ordered_free) +); + +/* For situiations that the work is freed */ +DECLARE_EVENT_CLASS(btrfs__work__done, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + ), + + TP_fast_assign( + __entry->work = work; + ), + + TP_printk("work->%p", __entry->work) +); + +DEFINE_EVENT(btrfs__work, btrfs_work_queued, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work, btrfs_work_sched, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work, btrfs_normal_work_done, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work__done, 
btrfs_all_work_done, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + +DEFINE_EVENT(btrfs__work, btrfs_ordered_sched, + + TP_PROTO(struct btrfs_work *work), + + TP_ARGS(work) +); + + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 8257b2dc3c1a1057b84a589827354abdc4c767fd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:38:19 +0800 Subject: Btrfs: introduce btrfs_{start, end}_nocow_write() for each subvolume If the snapshot creation happened after the nocow write but before the dirty data flush, we would fail to flush the dirty data because of no space. So we must keep track of when those nocow write operations start and when they end, if there are nocow writers, the snapshot creators must wait. In order to implement this function, I introduce btrfs_{start, end}_nocow_write(), which is similar to mnt_{want,drop}_write(). These two functions are only used for nocow file write operations. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 10 ++++++++++ fs/btrfs/disk-io.c | 42 +++++++++++++++++++++++++++++++++++++++++- fs/btrfs/extent-tree.c | 35 +++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 21 +++++++++++++++++---- fs/btrfs/ioctl.c | 35 ++++++++++++++++++++++++++++++----- 5 files changed, 133 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4d2e957b89f..374bb2f8ccd9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1681,6 +1681,11 @@ struct btrfs_fs_info { unsigned int update_uuid_tree_gen:1; }; +struct btrfs_subvolume_writers { + struct percpu_counter counter; + wait_queue_head_t wait; +}; + /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. 
@@ -1829,6 +1834,8 @@ struct btrfs_root { * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; + struct btrfs_subvolume_writers *subv_writers; + atomic_t will_be_snapshoted; }; struct btrfs_ioctl_defrag_range_args { @@ -3353,6 +3360,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); + +int btrfs_start_nocow_write(struct btrfs_root *root); +void btrfs_end_nocow_write(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f7d84d955764..7d09ca48c347 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1149,6 +1149,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, } } +static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) +{ + struct btrfs_subvolume_writers *writers; + int ret; + + writers = kmalloc(sizeof(*writers), GFP_NOFS); + if (!writers) + return ERR_PTR(-ENOMEM); + + ret = percpu_counter_init(&writers->counter, 0); + if (ret < 0) { + kfree(writers); + return ERR_PTR(ret); + } + + init_waitqueue_head(&writers->wait); + return writers; +} + +static void +btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) +{ + percpu_counter_destroy(&writers->counter); + kfree(writers); +} + static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, @@ -1205,6 +1231,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); + atomic_set(&root->will_be_snapshoted, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1502,6 +1529,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root) { int ret; + struct btrfs_subvolume_writers *writers; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), @@ -1511,6 +1539,13 @@ int btrfs_init_fs_root(struct btrfs_root *root) goto fail; } + writers = btrfs_alloc_subvolume_writers(); + if (IS_ERR(writers)) { + ret = PTR_ERR(writers); + goto fail; + } + root->subv_writers = writers; + btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); @@ -1518,8 +1553,11 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + goto free_writers; return 0; + +free_writers: + btrfs_free_subvolume_writers(root->subv_writers); fail: kfree(root->free_ino_ctl); kfree(root->free_ino_pinned); @@ -3459,6 +3497,8 @@ static void free_fs_root(struct btrfs_root *root) root->orphan_block_rsv = NULL; if (root->anon_dev) free_anon_bdev(root->anon_dev); + if (root->subv_writers) + btrfs_free_subvolume_writers(root->subv_writers); free_extent_buffer(root->node); free_extent_buffer(root->commit_root); kfree(root->free_ino_ctl); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 19ea8ad70c67..6b821c64b37b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8938,3 +8938,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) range->len = trimmed; return ret; } + +/* + * 
btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), + * they are used to prevent the some tasks writing data into the page cache + * by nocow before the subvolume is snapshoted, but flush the data into + * the disk after the snapshot creation. + */ +void btrfs_end_nocow_write(struct btrfs_root *root) +{ + percpu_counter_dec(&root->subv_writers->counter); + /* + * Make sure counter is updated before we wake up + * waiters. + */ + smp_mb(); + if (waitqueue_active(&root->subv_writers->wait)) + wake_up(&root->subv_writers->wait); +} + +int btrfs_start_nocow_write(struct btrfs_root *root) +{ + if (unlikely(atomic_read(&root->will_be_snapshoted))) + return 0; + + percpu_counter_inc(&root->subv_writers->counter); + /* + * Make sure counter is updated before we check for snapshot creation. + */ + smp_mb(); + if (unlikely(atomic_read(&root->will_be_snapshoted))) { + btrfs_end_nocow_write(root); + return 0; + } + return 1; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d88f2dc4d045..006cf9f0e852 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1410,6 +1410,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; + ret = btrfs_start_nocow_write(root); + if (!ret) + return -ENOSPC; + lockstart = round_down(pos, root->sectorsize); lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; @@ -1427,11 +1431,13 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); - if (ret <= 0) + if (ret <= 0) { ret = 0; - else + btrfs_end_nocow_write(root); + } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); + } unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1520,6 +1526,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!only_release_metadata) btrfs_free_reserved_data_space(inode, reserve_bytes); + else + btrfs_end_nocow_write(root); break; } @@ -1608,6 +1616,9 @@ again: } release_bytes = 0; + if (only_release_metadata) + btrfs_end_nocow_write(root); + if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); u64 lockend = lockstart + @@ -1634,10 +1645,12 @@ again: kfree(pages); if (release_bytes) { - if (only_release_metadata) + if (only_release_metadata) { + btrfs_end_nocow_write(root); btrfs_delalloc_release_metadata(inode, release_bytes); - else + } else { btrfs_delalloc_release_space(inode, release_bytes); + } } return num_written ? 
num_written : ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1ae45bd9d27d..57bc9f33fa3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -611,6 +611,23 @@ fail: return ret; } +static void btrfs_wait_nocow_write(struct btrfs_root *root) +{ + s64 writers; + DEFINE_WAIT(wait); + + do { + prepare_to_wait(&root->subv_writers->wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&root->subv_writers->counter); + if (writers) + schedule(); + + finish_wait(&root->subv_writers->wait, &wait); + } while (writers); +} + static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, char *name, int namelen, u64 *async_transid, bool readonly, @@ -624,15 +641,21 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!root->ref_cows) return -EINVAL; + atomic_inc(&root->will_be_snapshoted); + smp_mb__after_atomic_inc(); + btrfs_wait_nocow_write(root); + ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - return ret; + goto out; btrfs_wait_ordered_extents(root, -1); pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) - return -ENOMEM; + if (!pending_snapshot) { + ret = -ENOMEM; + goto out; + } btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -649,7 +672,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto out; + goto free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -700,8 +723,10 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -out: +free: kfree(pending_snapshot); +out: + atomic_dec(&root->will_be_snapshoted); return ret; } -- cgit v1.2.3 From 8b9d83cd6bebe10e9965d2cef8053b02663eaad8 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:54:55 +0800 Subject: Btrfs: fix early enospc due to the race of the two ordered extent wait btrfs_wait_ordered_roots() moves all the list entries to a new list, and then deals with them one by one. But if the other task invokes this function at that time, it would get a empty list. It makes the enospc error happens more early. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 751ee38083a9..73de19c37f5a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -589,7 +589,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. 
*/ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +static int __btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) { struct list_head splice, works; struct btrfs_ordered_extent *ordered, *next; @@ -598,7 +598,6 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); - mutex_lock(&root->fs_info->ordered_operations_mutex); spin_lock(&root->ordered_extent_lock); list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice) && nr) { @@ -630,6 +629,16 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) btrfs_put_ordered_extent(ordered); cond_resched(); } + + return count; +} + +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +{ + int count; + + mutex_lock(&root->fs_info->ordered_operations_mutex); + count = __btrfs_wait_ordered_extents(root, nr); mutex_unlock(&root->fs_info->ordered_operations_mutex); return count; @@ -643,6 +652,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); while (!list_empty(&splice) && nr) { @@ -654,7 +664,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = __btrfs_wait_ordered_extents(root, nr); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); @@ -665,6 +675,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) } list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); + mutex_unlock(&fs_info->ordered_operations_mutex); } /* -- cgit v1.2.3 From af7a65097b3f0a63caf1755df78d04e1a33588ef Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:54:56 +0800 Subject: Btrfs: wake up the tasks that wait for the io earlier The tasks that wait for the IO_DONE flag just care about the io of the dirty pages, so it is better to wake up them immediately after all the pages are written, not the whole process of the io completes. 
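The waiter/waker pattern this change relies on can be sketched outside the kernel. Below is a minimal user-space analogue (pthreads instead of kernel waitqueues, and an invented fake_ordered_extent type, so it is only a sketch of the idea, not btrfs code): the thread that finishes the last chunk of page I/O sets the done flag and wakes waiters right away, instead of leaving them asleep until later completion work has run.

#include <pthread.h>
#include <stdio.h>

/* Invented stand-in for an ordered extent: just enough state to show the
 * "wake up as soon as the last page is written" pattern. */
struct fake_ordered_extent {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	long bytes_left;
	int io_done;
};

/* Called once per completed chunk of dirty-page I/O. */
static void complete_chunk(struct fake_ordered_extent *oe, long bytes)
{
	pthread_mutex_lock(&oe->lock);
	oe->bytes_left -= bytes;
	if (oe->bytes_left == 0) {
		/* All pages written: set the flag and wake waiters now,
		 * without waiting for any later bookkeeping to finish. */
		oe->io_done = 1;
		pthread_cond_broadcast(&oe->wait);
	}
	pthread_mutex_unlock(&oe->lock);
}

static void *writer(void *arg)
{
	struct fake_ordered_extent *oe = arg;

	for (int i = 0; i < 4; i++)
		complete_chunk(oe, 1024);
	return NULL;
}

int main(void)
{
	struct fake_ordered_extent oe = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 4096, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, writer, &oe);

	pthread_mutex_lock(&oe.lock);
	while (!oe.io_done)	/* the waiter only cares about the pages */
		pthread_cond_wait(&oe.wait, &oe.lock);
	pthread_mutex_unlock(&oe.lock);

	printf("all dirty pages written, waiter released\n");
	pthread_join(t, NULL);
	return 0;
}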
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ordered-data.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 73de19c37f5a..a75eaa286b8a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -349,10 +349,13 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; @@ -410,10 +413,13 @@ have_entry: if (!uptodate) set_bit(BTRFS_ORDERED_IOERR, &entry->flags); - if (entry->bytes_left == 0) + if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); - else + if (waitqueue_active(&entry->wait)) + wake_up(&entry->wait); + } else { ret = 1; + } out: if (!ret && cached && entry) { *cached = entry; -- cgit v1.2.3 From 41bd9ca459a007cc5588563bb08de9677c8d23fd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:54:57 +0800 Subject: Btrfs: just do dirty page flush for the inode with compression before direct IO As the comment in the btrfs_direct_IO says, only the compressed pages need be flush again to make sure they are on the disk, but the common pages needn't, so we add a if statement to check if the inode has compressed pages or not, if no, skip the flush. And in order to prevent the write ranges from intersecting, we need wait for the running ordered extents. But the current code waits for them twice, one is done before the direct IO starts (in btrfs_wait_ordered_range()), the other is before we get the blocks, it is unnecessary. because we can do the direct IO without holding i_mutex, it means that the intersected ordered extents may happen during the direct IO, the first wait can not avoid this problem. So we use filemap_fdatawrite_range() instead of btrfs_wait_ordered_range() to remove the first wait. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 53697a80b849..f5e623371bf3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7422,15 +7422,15 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, smp_mb__after_atomic_inc(); /* - * The generic stuff only does filemap_write_and_wait_range, which isn't - * enough if we've written compressed pages to this area, so we need to - * call btrfs_wait_ordered_range to make absolutely sure that any - * outstanding dirty pages are on disk. + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. 
*/ count = iov_length(iov, nr_segs); - ret = btrfs_wait_ordered_range(inode, offset, count); - if (ret) - return ret; + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, count); if (rw & WRITE) { /* -- cgit v1.2.3 From b88935bf9822cda58fd70dffe8e016d448757d40 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:54:58 +0800 Subject: Btrfs: remove the unnecessary flush when preparing the pages Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 006cf9f0e852..b2143b8c33c5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1360,11 +1360,11 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, last_pos, 0, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, last_pos); + ordered = btrfs_lookup_ordered_range(inode, start_pos, + last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->len > start_pos && ordered->file_offset <= last_pos) { - btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, start_pos, last_pos, cached_state, GFP_NOFS); @@ -1372,12 +1372,9 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, unlock_page(pages[i]); page_cache_release(pages[i]); } - ret = btrfs_wait_ordered_range(inode, start_pos, - last_pos - start_pos + 1); - if (ret) - return ret; - else - return -EAGAIN; + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); -- cgit v1.2.3 From 0424c548976b4c2a72c0bdbea425cf9d51e82d0f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:54:59 +0800 Subject: Btrfs: remove unnecessary lock in may_commit_transaction() The reasons are: - The per-cpu counter has its own lock to protect itself. - Here we needn't get an exact value.
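Why the outer lock can go away is easier to see with a small stand-alone sketch (invented names; a mutex per shard stands in for the per-cpu counter's internal protection, so this is only an analogue of the pattern, not the kernel API): each shard protects itself, and a caller that only needs an approximate comparison sums the shards without taking any lock that belongs to the surrounding structure (the space_info in the patch).

#include <pthread.h>
#include <stdio.h>

#define NR_SHARDS 4

/* Toy analogue of a per-cpu counter: every shard carries its own lock. */
struct sharded_counter {
	struct {
		pthread_mutex_t lock;
		long val;
	} shard[NR_SHARDS];
};

static void counter_add(struct sharded_counter *c, int cpu, long delta)
{
	pthread_mutex_lock(&c->shard[cpu].lock);
	c->shard[cpu].val += delta;
	pthread_mutex_unlock(&c->shard[cpu].lock);
}

/* Sum the shards using only the counter's own locks; the result is a
 * snapshot that is good enough for a heuristic comparison. */
static long counter_sum(struct sharded_counter *c)
{
	long sum = 0;

	for (int i = 0; i < NR_SHARDS; i++) {
		pthread_mutex_lock(&c->shard[i].lock);
		sum += c->shard[i].val;
		pthread_mutex_unlock(&c->shard[i].lock);
	}
	return sum;
}

static int counter_compare(struct sharded_counter *c, long rhs)
{
	long sum = counter_sum(c);	/* no outer lock needed */

	return (sum > rhs) - (sum < rhs);
}

int main(void)
{
	struct sharded_counter c;

	for (int i = 0; i < NR_SHARDS; i++) {
		pthread_mutex_init(&c.shard[i].lock, NULL);
		c.shard[i].val = 0;
	}
	counter_add(&c, 0, 700);
	counter_add(&c, 1, 400);
	printf("compare against 1000: %d\n", counter_compare(&c, 1000));
	return 0;
}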
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b821c64b37b..5608b4f8a27e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4112,13 +4112,9 @@ static int may_commit_transaction(struct btrfs_root *root, goto commit; /* See if there is enough pinned space to make this reservation */ - spin_lock(&space_info->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes) >= 0) { - spin_unlock(&space_info->lock); + bytes) >= 0) goto commit; - } - spin_unlock(&space_info->lock); /* * See if there is some space in the delayed insertion reservation for @@ -4127,16 +4123,13 @@ static int may_commit_transaction(struct btrfs_root *root, if (space_info != delayed_rsv->space_info) return -ENOSPC; - spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); if (percpu_counter_compare(&space_info->total_bytes_pinned, bytes - delayed_rsv->size) >= 0) { spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); return -ENOSPC; } spin_unlock(&delayed_rsv->lock); - spin_unlock(&space_info->lock); commit: trans = btrfs_join_transaction(root); -- cgit v1.2.3 From 24af7dd1881f9f5c13c7d82e22d7858137383766 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:55:00 +0800 Subject: Btrfs: reclaim delalloc metadata more aggressively generic/074 in xfstests sometimes failed with an enospc error. The reason is that we reclaimed only the space we needed from the space reserved for delalloc and then tried to reserve it, but another task could do a no-flush reservation between that reclamation and our reservation: Task1 runs shrink_delalloc() and reclaims 1 block (the space that can be reserved is now 1 block); Task2 then does a no-flush reservation of 1 block (the space that can be reserved is now 0 blocks); Task1's attempt to reserve 1 block fails. So Task1's reservation failed even though there would have been enough space to reserve had we reclaimed more beforehand. Fix this problem by reclaiming the reserved delalloc metadata space more aggressively. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5608b4f8a27e..5c0c5457268a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4174,7 +4174,7 @@ static int flush_space(struct btrfs_root *root, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(root, num_bytes, orig_bytes, + shrink_delalloc(root, num_bytes * 2, orig_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: -- cgit v1.2.3 From 6c255e67cec1c38a0569c7f823eba63f9449ccf8 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:55:01 +0800 Subject: Btrfs: don't flush all delalloc inodes when we don't get the s_umount lock We needn't flush all delalloc inodes when we don't get the s_umount lock, or we would make the tasks wait for a long time.
Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/extent-tree.c | 8 ++++---- fs/btrfs/inode.c | 34 +++++++++++++++++++--------------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 2 +- fs/btrfs/transaction.c | 2 +- 7 files changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 374bb2f8ccd9..5a800986f416 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3740,7 +3740,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ec1c3f3a775d..9f2290509aca 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -491,7 +491,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5c0c5457268a..c6b6a6e3e735 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3971,7 +3971,7 @@ static int can_overcommit(struct btrfs_root *root, } static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, - unsigned long nr_pages) + unsigned long nr_pages, int nr_items) { struct super_block *sb = root->fs_info->sb; @@ -3986,9 +3986,9 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, * the filesystem is readonly(all dirty pages are written to * the disk). */ - btrfs_start_delalloc_roots(root->fs_info, 0); + btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, nr_items); } } @@ -4045,7 +4045,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, while (delalloc_bytes && loops < 3) { max_reclaim = min(delalloc_bytes, to_reclaim); nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; - btrfs_writeback_inodes_sb_nr(root, nr_pages); + btrfs_writeback_inodes_sb_nr(root, nr_pages, items); /* * We need to wait for the async pages to actually start before * we do anything. diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f5e623371bf3..fbaf1ac3941b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8437,7 +8437,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. 
*/ -static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) +static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, + int nr) { struct btrfs_inode *binode; struct inode *inode; @@ -8471,23 +8472,19 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput) else iput(inode); ret = -ENOMEM; - goto out; + break; } list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - + ret++; + if (nr != -1 && ret >= nr) + break; cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - return 0; -out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); @@ -8508,7 +8505,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EROFS; - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, -1); + if (ret > 0) + ret = 0; /* * the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that @@ -8525,7 +8524,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return ret; } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, + int nr) { struct btrfs_root *root; struct list_head splice; @@ -8538,7 +8538,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); - while (!list_empty(&splice)) { + while (!list_empty(&splice) && nr) { root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_fs_root(root); @@ -8547,15 +8547,20 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = __start_delalloc_inodes(root, delay_iput); + ret = __start_delalloc_inodes(root, delay_iput, nr); btrfs_put_fs_root(root); - if (ret) + if (ret < 0) goto out; + if (nr != -1) { + nr -= ret; + WARN_ON(nr < 0); + } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); + ret = 0; atomic_inc(&fs_info->async_submit_draining); while (atomic_read(&fs_info->nr_async_submits) || atomic_read(&fs_info->async_delalloc_pages)) { @@ -8564,7 +8569,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput) atomic_read(&fs_info->async_delalloc_pages) == 0)); } atomic_dec(&fs_info->async_submit_draining); - return 0; out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 57bc9f33fa3c..e1747701f520 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4893,7 +4893,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(root->fs_info, 0); + ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) return ret; ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 07b3b36f40ee..def428a25b2a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4248,7 +4248,7 @@ int 
btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); if (ret < 0) { err = ret; goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 79a4186b724a..a999b85d1176 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1620,7 +1620,7 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1); + return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } -- cgit v1.2.3 From 31f3d255c677073f83daa1e0671bbf2157bf8edc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:55:02 +0800 Subject: Btrfs: split the global ordered extents mutex When we create a snapshot, we just need wait the ordered extents in the source fs/file root, but because we use the global mutex to protect this ordered extents list of the source fs/file root to avoid accessing a empty list, if someone got the mutex to access the ordered extents list of the other fs/file root, we had to wait. This patch splits the above global mutex, now every fs/file root has its own mutex to protect its own list. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 1 + fs/btrfs/ordered-data.c | 17 ++++------------- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5a800986f416..5f4921554f0a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1814,6 +1814,8 @@ struct btrfs_root { struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; + + struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7d09ca48c347..237b5b5a2200 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1220,6 +1220,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, spin_lock_init(&root->log_extents_lock[1]); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); + mutex_init(&root->ordered_extent_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a75eaa286b8a..a94b05f72869 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -595,7 +595,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. 
*/ -static int __btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) { struct list_head splice, works; struct btrfs_ordered_extent *ordered, *next; @@ -604,6 +604,7 @@ static int __btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) INIT_LIST_HEAD(&splice); INIT_LIST_HEAD(&works); + mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); list_splice_init(&root->ordered_extents, &splice); while (!list_empty(&splice) && nr) { @@ -635,17 +636,7 @@ static int __btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) btrfs_put_ordered_extent(ordered); cond_resched(); } - - return count; -} - -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) -{ - int count; - - mutex_lock(&root->fs_info->ordered_operations_mutex); - count = __btrfs_wait_ordered_extents(root, nr); - mutex_unlock(&root->fs_info->ordered_operations_mutex); + mutex_unlock(&root->ordered_extent_mutex); return count; } @@ -670,7 +661,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = __btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr); btrfs_put_fs_root(root); spin_lock(&fs_info->ordered_root_lock); -- cgit v1.2.3 From 573bfb72f7608eb7097d2dd036a714a6ab20cffe Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 6 Mar 2014 13:55:03 +0800 Subject: Btrfs: fix possible empty list access when flushing the delalloc inodes We didn't have a lock to protect the access to the delalloc inodes list, that is we might access a empty delalloc inodes list if someone start flushing delalloc inodes because the delalloc inodes were moved into a other list temporarily. Fix it by wrapping the access with a lock. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/inode.c | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5f4921554f0a..2a9d32e193a5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1490,6 +1490,7 @@ struct btrfs_fs_info { */ struct list_head ordered_roots; + struct mutex delalloc_root_mutex; spinlock_t delalloc_root_lock; /* all fs/file tree roots that have delalloc inodes. */ struct list_head delalloc_roots; @@ -1805,6 +1806,7 @@ struct btrfs_root { spinlock_t root_item_lock; atomic_t refs; + struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. 
It is possible for diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 237b5b5a2200..d9698fda2d12 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1221,6 +1221,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); mutex_init(&root->ordered_extent_mutex); + mutex_init(&root->delalloc_mutex); init_waitqueue_head(&root->log_writer_wait); init_waitqueue_head(&root->log_commit_wait[0]); init_waitqueue_head(&root->log_commit_wait[1]); @@ -2209,6 +2210,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->buffer_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); + mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); init_completion(&fs_info->kobj_unregister); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fbaf1ac3941b..0ec876657923 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8450,6 +8450,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, INIT_LIST_HEAD(&works); INIT_LIST_HEAD(&splice); + mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { @@ -8495,6 +8496,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } + mutex_unlock(&root->delalloc_mutex); return ret; } @@ -8536,6 +8538,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, INIT_LIST_HEAD(&splice); + mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice) && nr) { @@ -8575,6 +8578,7 @@ out: list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } + mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } -- cgit v1.2.3 From 3bbb24b20a8800158c33eca8564f432dd14d0bf3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 6 Mar 2014 19:01:07 -0500 Subject: Btrfs: fix deadlock with nested trans handles Zach found this deadlock that would happen like this btrfs_end_transaction <- reduce trans->use_count to 0 btrfs_run_delayed_refs btrfs_cow_block find_free_extent btrfs_start_transaction <- increase trans->use_count to 1 allocate chunk btrfs_end_transaction <- decrease trans->use_count to 0 btrfs_run_delayed_refs lock tree block we are cowing above ^^ We need to only decrease trans->use_count if it is above 1, otherwise leave it alone. This will make nested trans be the only ones who decrease their added ref, and will let us get rid of the trans->use_count++ hack if we have to commit the transaction. 
Thanks, cc: stable@vger.kernel.org Reported-by: Zach Brown Signed-off-by: Josef Bacik Tested-by: Zach Brown Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a999b85d1176..a04707f740d6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -683,7 +683,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; - if (--trans->use_count) { + if (trans->use_count > 1) { + trans->use_count--; trans->block_rsv = trans->orig_rsv; return 0; } @@ -731,17 +732,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, } if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { - if (throttle) { - /* - * We may race with somebody else here so end up having - * to call end_transaction on ourselves again, so inc - * our use_count. - */ - trans->use_count++; + if (throttle) return btrfs_commit_transaction(trans, root); - } else { + else wake_up_process(info->transaction_kthread); - } } if (trans->type & __TRANS_FREEZABLE) -- cgit v1.2.3 From 308d9800b2c4f1fb344dbf055912d3140438bac0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Mar 2014 13:56:15 +0000 Subject: Btrfs: cache extent states in defrag code path When locking file ranges in the inode's io_tree, cache the first extent state that belongs to the target range, so that when unlocking the range we don't need to search in the io_tree again, reducing cpu time and making and therefore holding the io_tree's lock for a shorter period. Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e1747701f520..3ad5c10d3704 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -986,10 +986,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) read_unlock(&em_tree->lock); if (!em) { + struct extent_state *cached = NULL; + u64 end = start + len - 1; + /* get the big lock and read metadata off disk */ - lock_extent(io_tree, start, start + len - 1); + lock_extent_bits(io_tree, start, end, 0, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); - unlock_extent(io_tree, start, start + len - 1); + unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); if (IS_ERR(em)) return NULL; @@ -1128,10 +1131,12 @@ again: page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { - lock_extent(tree, page_start, page_end); + lock_extent_bits(tree, page_start, page_end, + 0, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); - unlock_extent(tree, page_start, page_end); + unlock_extent_cached(tree, page_start, page_end, + &cached_state, GFP_NOFS); if (!ordered) break; -- cgit v1.2.3 From ef66af101a261f1c86ef9ec3859ebd9c28ee2e54 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Mar 2014 14:31:44 +0000 Subject: Btrfs: add missing kfree in btrfs_destroy_workqueue Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 00623dd16b81..66532b8f0f7c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -315,6 +315,7 @@ void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) if (wq->high) 
__btrfs_destroy_workqueue(wq->high); __btrfs_destroy_workqueue(wq->normal); + kfree(wq); } void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max) -- cgit v1.2.3 From 72de6b5393c15c5228074008bbdc47e92bf6d4f7 Mon Sep 17 00:00:00 2001 From: Guangyu Sun Date: Tue, 11 Mar 2014 11:24:18 -0700 Subject: Btrfs: return EPERM when deleting a default subvolume The error message is confusing: # btrfs sub delete /mnt/mysub/ Delete subvolume '/mnt/mysub' ERROR: cannot delete '/mnt/mysub' - Directory not empty The error message does not make sense to me: it's not about deleting a directory but a subvolume, and it doesn't matter whether the subvolume is empty or not. Maybe EPERM is more appropriate in this case, combined with an explanatory kernel log message. (e.g. "subvolume with ID 123 cannot be deleted because it is configured as default subvolume.") Reported-by: Koen De Wit Signed-off-by: Guangyu Sun Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3ad5c10d3704..10c18a6582cc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1858,7 +1858,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { - ret = -ENOTEMPTY; + ret = -EPERM; + btrfs_err(root->fs_info, "deleting default subvolume " + "%llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); -- cgit v1.2.3 From f094c9bd3e12ee83e91f4249b600d4d2ac0a4738 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 12 Mar 2014 01:28:24 +0000 Subject: Btrfs: less fs tree lock contention when using autodefrag When finding new extents during an autodefrag, don't do so many fs tree lookups to find an extent with a size smaller than the target threshold. Instead, after each fs tree forward search immediately unlock upper levels and process the entire leaf while holding a read lock on the leaf, since our leaf processing is very fast. This reduces lock contention, allowing for higher concurrency when other tasks want to write/update items related to other inodes in the fs tree, as we're not holding read locks on upper tree levels while processing the leaf and we do fewer tree searches. Test: sysbench --test=fileio --file-num=512 --file-total-size=16G \ --file-test-mode=rndrw --num-threads=32 --file-block-size=32768 \ --file-rw-ratio=3 --file-io-mode=sync --max-time=1800 \ --max-requests=10000000000 [prepare|run] (filesystem mounted with -o autodefrag, averages of 5 runs) Before this change: 58.852Mb/sec throughput, read 77.589Gb, written 25.863Gb After this change: 63.034Mb/sec throughput, read 83.102Gb, written 27.701Gb Test machine: quad core intel i5-3570K, 32Gb of RAM, SSD.
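The leaf-walk idea can be illustrated with a small stand-alone sketch (invented types; an rwlock-protected array stands in for a leaf and its tree lock, so this is only an analogue of the pattern, not the btrfs path code): do one search, then keep stepping through the remaining slots under the same read lock instead of repeating a top-down search for every item.

#include <pthread.h>
#include <stdio.h>

/* Toy model of the optimisation: the "leaf" is a sorted array guarded by
 * one rwlock.  All qualifying items are processed under a single read-lock
 * acquisition rather than one search-and-lock cycle per item. */
struct leaf {
	pthread_rwlock_t lock;
	int nritems;
	long item_size[64];
};

static int search_forward(const struct leaf *l, int start, long threshold)
{
	for (int slot = start; slot < l->nritems; slot++)
		if (l->item_size[slot] < threshold)
			return slot;	/* next extent smaller than threshold */
	return -1;
}

static int count_small_extents(struct leaf *l, long threshold)
{
	int found = 0;

	pthread_rwlock_rdlock(&l->lock);	/* one lock acquisition ... */
	int slot = search_forward(l, 0, threshold);
	while (slot >= 0) {			/* ... then walk the whole leaf */
		found++;
		slot = search_forward(l, slot + 1, threshold);
	}
	pthread_rwlock_unlock(&l->lock);
	return found;
}

int main(void)
{
	struct leaf l = { .nritems = 5,
			  .item_size = { 4096, 256, 8192, 128, 512 } };

	pthread_rwlock_init(&l.lock, NULL);
	printf("small extents found: %d\n", count_small_extents(&l, 1024));
	return 0;
}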
Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 10c18a6582cc..3ca313b138ca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -935,12 +935,14 @@ static int find_new_extents(struct btrfs_root *root, min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - path->keep_locks = 1; - while (1) { + path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); +process_slot: if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) @@ -959,6 +961,12 @@ static int find_new_extents(struct btrfs_root *root, return 0; } + path->slots[0]++; + if (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); + goto process_slot; + } + if (min_key.offset == (u64)-1) goto none; -- cgit v1.2.3 From c3a468915a384c0015263edd9b7263775599a323 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 12 Mar 2014 08:05:33 +0000 Subject: btrfs: Add trace for btrfs_workqueue alloc/destroy Since most of the btrfs_workqueue is printed as pointer address, for easier analysis, add trace for btrfs_workqueue alloc/destroy. So it is possible to determine the workqueue that a given work belongs to(by comparing the wq pointer address with alloc trace event). Signed-off-by: Qu Wenruo Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 7 ++++-- fs/btrfs/async-thread.h | 2 +- include/trace/events/btrfs.h | 55 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 66532b8f0f7c..ecb5832c0967 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -56,7 +56,8 @@ struct btrfs_workqueue { }; static inline struct __btrfs_workqueue -*__btrfs_alloc_workqueue(char *name, int flags, int max_active, int thresh) +*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, + int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -92,13 +93,14 @@ static inline struct __btrfs_workqueue INIT_LIST_HEAD(&ret->ordered_list); spin_lock_init(&ret->list_lock); spin_lock_init(&ret->thres_lock); + trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); return ret; } static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue *btrfs_alloc_workqueue(char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh) @@ -305,6 +307,7 @@ static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) { destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); kfree(wq); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 0a891cdc4c28..9c6b66d15fb0 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -38,7 +38,7 @@ struct btrfs_work { unsigned long flags; }; -struct btrfs_workqueue *btrfs_alloc_workqueue(char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index c346919254a9..4ee4e30d26d9 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -22,6 +22,7 @@ struct btrfs_free_cluster; struct map_lookup; struct extent_buffer; struct btrfs_work; +struct 
__btrfs_workqueue; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1063,6 +1064,60 @@ DEFINE_EVENT(btrfs__work, btrfs_ordered_sched, TP_ARGS(work) ); +DECLARE_EVENT_CLASS(btrfs__workqueue, + + TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high), + + TP_ARGS(wq, name, high), + + TP_STRUCT__entry( + __field( void *, wq ) + __string( name, name ) + __field( int , high ) + ), + + TP_fast_assign( + __entry->wq = wq; + __assign_str(name, name); + __entry->high = high; + ), + + TP_printk("name=%s%s, wq=%p", __get_str(name), + __print_flags(__entry->high, "", + {(WQ_HIGHPRI), "-high"}), + __entry->wq) +); + +DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc, + + TP_PROTO(struct __btrfs_workqueue *wq, const char *name, int high), + + TP_ARGS(wq, name, high) +); + +DECLARE_EVENT_CLASS(btrfs__workqueue_done, + + TP_PROTO(struct __btrfs_workqueue *wq), + + TP_ARGS(wq), + + TP_STRUCT__entry( + __field( void *, wq ) + ), + + TP_fast_assign( + __entry->wq = wq; + ), + + TP_printk("wq=%p", __entry->wq) +); + +DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, + + TP_PROTO(struct __btrfs_workqueue *wq), + + TP_ARGS(wq) +); #endif /* _TRACE_BTRFS_H */ -- cgit v1.2.3 From 21543baddcdbaa49db5ac8766ae564381e7c64d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 14 Mar 2014 20:55:01 +0000 Subject: Btrfs: fix race when updating existing ref head While we update an existing ref head's extent_op, we're not holding its spinlock, so while we're updating its extent_op contents (key, flags) we can have a task running __btrfs_run_delayed_refs() that holds the ref head's lock and sets its extent_op to NULL right after the task updating the ref head just checked its extent_op was not NULL. Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/delayed-ref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 2502ba5a3ac0..31299646024d 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -495,6 +495,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, ref = btrfs_delayed_node_to_head(update); BUG_ON(existing_ref->is_data != ref->is_data); + spin_lock(&existing_ref->lock); if (ref->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -536,7 +537,6 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - spin_lock(&existing_ref->lock); existing->ref_mod += update->ref_mod; spin_unlock(&existing_ref->lock); } -- cgit v1.2.3 From 425b5dafc8738d3d6d6b05827f40bd32bf04a20b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Mar 2014 17:56:06 +0000 Subject: Btrfs: remove unnecessary inode generation lookup in send No need to search in the send tree for the generation number of the inode, we already have it in the recorded_ref structure passed to us. 
Signed-off-by: Filipe David Borba Manana Reviewed-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/send.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 646369179697..92d4ae8a8ae6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3179,7 +3179,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, int ret; u64 ino = parent_ref->dir; u64 parent_ino_before, parent_ino_after; - u64 new_gen, old_gen; + u64 old_gen; struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; @@ -3197,12 +3197,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, else if (ret < 0) return ret; - ret = get_inode_info(sctx->send_root, ino, NULL, &new_gen, - NULL, NULL, NULL, NULL); - if (ret < 0) - return ret; - - if (new_gen != old_gen) + if (parent_ref->dir_gen != old_gen) return 0; path_before = fs_path_alloc(); -- cgit v1.2.3 From 7b119a8b8998f17abd6caf928dee5bf203eef8c5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 16 Mar 2014 20:37:26 +0000 Subject: Btrfs: fix incremental send's decision to delay a dir move/rename It's possible to change the parent/child relationship between directories in such a way that if a child directory has a higher inode number than its parent, it doesn't necessarily mean the child rename/move operation can be performed immediately. The parent might have its own rename/move operation delayed, therefore in this case the child needs to have its rename/move operation delayed too, and be performed after its new parent's rename/move. Steps to reproduce the issue: $ umount /mnt $ mkfs.btrfs -f /dev/sdd $ mount /dev/sdd /mnt $ mkdir /mnt/A $ mkdir /mnt/B $ mkdir /mnt/C $ mv /mnt/C /mnt/A $ mv /mnt/B /mnt/A/C $ mkdir /mnt/A/C/D $ btrfs subvolume snapshot -r /mnt /mnt/snap1 $ btrfs send /mnt/snap1 -f /tmp/base.send $ mv /mnt/A/C/D /mnt/A/D2 $ mv /mnt/A/C/B /mnt/A/D2/B2 $ mv /mnt/A/C /mnt/A/D2/B2/C2 $ btrfs subvolume snapshot -r /mnt /mnt/snap2 $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send The incremental send caused the kernel code to enter an infinite loop when building the path string for directory C after its references are processed. The necessary conditions here are that C has an inode number higher than both A and B, B has a higher inode number than A, and D has the highest inode number, that is: inode_number(A) < inode_number(B) < inode_number(C) < inode_number(D) The same issue could happen if after the first snapshot there's any number of intermediary parent directories between A2 and B2, and between B2 and C2. A test case for xfstests follows, covering this simple case and more advanced ones, with files and hard links created inside the directories.
Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/send.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 92d4ae8a8ae6..f16472409eec 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3184,12 +3184,12 @@ static int wait_for_parent_move(struct send_ctx *sctx, struct fs_path *path_after = NULL; int len1, len2; - if (parent_ref->dir <= sctx->cur_ino) - return 0; - if (is_waiting_for_move(sctx, ino)) return 1; + if (parent_ref->dir <= sctx->cur_ino) + return 0; + ret = get_inode_info(sctx->parent_root, ino, NULL, &old_gen, NULL, NULL, NULL, NULL); if (ret == -ENOENT) -- cgit v1.2.3 From bfa7e1f8be4bd7118e485a42cc8889530d415d05 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 19 Mar 2014 14:20:54 +0000 Subject: Btrfs: part 2, fix incremental send's decision to delay a dir move/rename For an incremental send, fix the process of determining whether the directory inode we're currently processing needs to have its move/rename operation delayed. We were ignoring the fact that if the inode's new immediate ancestor has a higher inode number than ours but wasn't renamed/moved, we might still need to delay our move/rename, because some other ancestor directory higher in the hierarchy might have an inode number higher than ours *and* was renamed/moved too - in this case we have to wait for rename/move of that ancestor to happen before our current directory's rename/move operation. Simple steps to reproduce this issue: $ mkfs.btrfs -f /dev/sdd $ mount /dev/sdd /mnt $ mkdir -p /mnt/a/x1/x2 $ mkdir /mnt/a/Z $ mkdir -p /mnt/a/x1/x2/x3/x4/x5 $ btrfs subvolume snapshot -r /mnt /mnt/snap1 $ btrfs send /mnt/snap1 -f /tmp/base.send $ mv /mnt/a/x1/x2/x3 /mnt/a/Z/X33 $ mv /mnt/a/x1/x2 /mnt/a/Z/X33/x4/x5/X22 $ btrfs subvolume snapshot -r /mnt /mnt/snap2 $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send The incremental send caused the kernel code to enter an infinite loop when building the path string for directory Z after its references are processed. A more complex scenario: $ mkfs.btrfs -f /dev/sdd $ mount /dev/sdd /mnt $ mkdir -p /mnt/a/b/c/d $ mkdir /mnt/a/b/c/d/e $ mkdir /mnt/a/b/c/d/f $ mv /mnt/a/b/c/d/e /mnt/a/b/c/d/f/E2 $ mkdir /mmt/a/b/c/g $ mv /mnt/a/b/c/d /mnt/a/b/D2 $ btrfs subvolume snapshot -r /mnt /mnt/snap1 $ btrfs send /mnt/snap1 -f /tmp/base.send $ mkdir /mnt/a/o $ mv /mnt/a/b/c/g /mnt/a/b/D2/f/G2 $ mv /mnt/a/b/D2 /mnt/a/b/dd $ mv /mnt/a/b/c /mnt/a/C2 $ mv /mnt/a/b/dd/f /mnt/a/o/FF $ mv /mnt/a/b /mnt/a/o/FF/E2/BB $ btrfs subvolume snapshot -r /mnt /mnt/snap2 $ btrfs send -p /mnt/snap1 /mnt/snap2 -f /tmp/incremental.send A test case for xfstests follows. 
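The ordering rule behind these two send fixes can be modeled in a few lines of stand-alone C (invented struct and field names, not the send.c data structures; the real code also compares each ancestor's path before and after the change, as the diff below shows): walk up the ancestors in the new snapshot and delay the current directory's move while any of them was itself moved/renamed but not yet processed, or is already waiting for a delayed move of its own.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Loose model of the constraint: a directory cannot be renamed/moved in
 * the send stream while some ancestor at its *new* location still has to
 * be renamed/moved first.  All types and fields here are invented. */
struct dir {
	unsigned long long ino;
	bool renamed;		/* moved/renamed between the two snapshots */
	bool waiting_for_move;	/* its own rename is already delayed */
	struct dir *new_parent;	/* parent in the new snapshot */
};

static bool must_delay_move(const struct dir *d, unsigned long long cur_ino)
{
	for (const struct dir *p = d->new_parent; p; p = p->new_parent)
		if (p->waiting_for_move || (p->renamed && p->ino > cur_ino))
			return true;
	return false;
}

int main(void)
{
	struct dir a = { 256, false, false, NULL };
	struct dir b = { 300, true,  true,  &a };	/* moved, not yet emitted */
	struct dir c = { 270, false, false, &b };	/* directory being processed */

	printf("delay move of ino %llu: %s\n", c.ino,
	       must_delay_move(&c, c.ino) ? "yes" : "no");
	return 0;
}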
Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason --- fs/btrfs/send.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f16472409eec..143fed3f4586 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2916,7 +2916,10 @@ static void free_waiting_dir_move(struct send_ctx *sctx, kfree(dm); } -static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) +static int add_pending_dir_move(struct send_ctx *sctx, + u64 ino, + u64 ino_gen, + u64 parent_ino) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2929,8 +2932,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 parent_ino) if (!pm) return -ENOMEM; pm->parent_ino = parent_ino; - pm->ino = sctx->cur_ino; - pm->gen = sctx->cur_inode_gen; + pm->ino = ino; + pm->gen = ino_gen; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3183,6 +3186,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, struct fs_path *path_before = NULL; struct fs_path *path_after = NULL; int len1, len2; + int register_upper_dirs; + u64 gen; if (is_waiting_for_move(sctx, ino)) return 1; @@ -3220,7 +3225,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, } ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, - NULL, path_after); + &gen, path_after); if (ret == -ENOENT) { ret = 0; goto out; @@ -3237,6 +3242,60 @@ static int wait_for_parent_move(struct send_ctx *sctx, } ret = 0; + /* + * Ok, our new most direct ancestor has a higher inode number but + * wasn't moved/renamed. So maybe some of the new ancestors higher in + * the hierarchy have an higher inode number too *and* were renamed + * or moved - in this case we need to wait for the ancestor's rename + * or move operation before we can do the move/rename for the current + * inode. 
+ */ + register_upper_dirs = 0; + ino = parent_ino_after; +again: + while ((ret == 0 || register_upper_dirs) && ino > sctx->cur_ino) { + u64 parent_gen; + + fs_path_reset(path_before); + fs_path_reset(path_after); + + ret = get_first_ref(sctx->send_root, ino, &parent_ino_after, + &parent_gen, path_after); + if (ret < 0) + goto out; + ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before, + NULL, path_before); + if (ret == -ENOENT) { + ret = 0; + break; + } else if (ret < 0) { + goto out; + } + + len1 = fs_path_len(path_before); + len2 = fs_path_len(path_after); + if (parent_ino_before != parent_ino_after || len1 != len2 || + memcmp(path_before->start, path_after->start, len1)) { + ret = 1; + if (register_upper_dirs) { + break; + } else { + register_upper_dirs = 1; + ino = parent_ref->dir; + gen = parent_ref->dir_gen; + goto again; + } + } else if (register_upper_dirs) { + ret = add_pending_dir_move(sctx, ino, gen, + parent_ino_after); + if (ret < 0 && ret != -EEXIST) + goto out; + } + + ino = parent_ino_after; + gen = parent_gen; + } + out: fs_path_free(path_before); fs_path_free(path_after); @@ -3402,7 +3461,9 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; if (ret) { ret = add_pending_dir_move(sctx, - cur->dir); + sctx->cur_ino, + sctx->cur_inode_gen, + cur->dir); *pending_move = 1; } else { ret = send_rename(sctx, valid_path, -- cgit v1.2.3 From 4485386853454f184235c8a973b29fa7fa522eb1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Mar 2014 13:35:14 -0400 Subject: Btrfs: take into account total references when doing backref lookup I added an optimization for large files where we would stop searching for backrefs once we had looked at the number of references we currently had for this extent. This works great most of the time, but for snapshots that point to this extent and has changes in the original root this assumption falls on it face. So keep track of any delayed ref mods made and add in the actual ref count as reported by the extent item and use that to limit how far down an inode we'll search for extents. 
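The bound being computed can be shown with a tiny stand-alone sketch (invented names; in the kernel the value is threaded through the __add_inline_refs/__add_delayed_refs helpers, as the diff below shows): the limit for the backref walk is the reference count stored in the extent item plus the net effect of any queued delayed ref modifications, not just the number of references the current inode happens to hold.

#include <stdio.h>

/* Toy version of the counting: the on-disk extent item reports a number
 * of references, and queued delayed refs add or drop some more.  The
 * backref walk should keep going until it has seen that total. */
struct delayed_ref_mod {
	int sign;	/* +1 for an added ref, -1 for a dropped ref */
	int count;
};

static unsigned long total_refs(unsigned long on_disk_refs,
				const struct delayed_ref_mod *mods,
				int nr_mods)
{
	long total = (long)on_disk_refs;

	for (int i = 0; i < nr_mods; i++)
		total += mods[i].sign * mods[i].count;
	return total > 0 ? (unsigned long)total : 0;
}

int main(void)
{
	/* Extent item says 3 refs; one pending delayed ref adds another. */
	struct delayed_ref_mod mods[] = { { +1, 1 } };
	unsigned long limit = total_refs(3, mods, 1);

	printf("walk until %lu parents have been found\n", limit);
	return 0;
}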
Thanks, Reportedy-by: Hugo Mills Signed-off-by: Josef Bacik Reported-by: Hugo Mills Tested-by: Hugo Mills Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 860f4f22b9b0..aad7201ad11b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -220,7 +220,8 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct __prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos) + int level, u64 time_seq, const u64 *extent_item_pos, + u64 total_refs) { int ret = 0; int slot; @@ -249,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) ret = btrfs_next_old_leaf(root, path, time_seq); - while (!ret && count < ref->count) { + while (!ret && count < total_refs) { eb = path->nodes[0]; slot = path->slots[0]; @@ -306,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct __prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { struct btrfs_root *root; struct btrfs_key root_key; @@ -361,7 +362,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos); + extent_item_pos, total_refs); out: path->lowest_level = 0; btrfs_release_path(path); @@ -374,7 +375,7 @@ out: static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct list_head *head, - const u64 *extent_item_pos) + const u64 *extent_item_pos, u64 total_refs) { int err; int ret = 0; @@ -400,7 +401,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, if (ref->count == 0) continue; err = __resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos); + parents, extent_item_pos, + total_refs); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. 
@@ -557,7 +559,7 @@ static void __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs) + struct list_head *prefs, u64 *total_refs) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; @@ -593,6 +595,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, default: BUG_ON(1); } + *total_refs += (node->ref_mod * sgn); switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { struct btrfs_delayed_tree_ref *ref; @@ -653,7 +656,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, */ static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - int *info_level, struct list_head *prefs) + int *info_level, struct list_head *prefs, + u64 *total_refs) { int ret = 0; int slot; @@ -677,6 +681,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); + *total_refs += btrfs_extent_refs(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); ptr = (unsigned long)(ei + 1); @@ -859,6 +864,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct list_head prefs; struct __prelim_ref *ref; struct extent_inode_elem *eie = NULL; + u64 total_refs = 0; INIT_LIST_HEAD(&prefs); INIT_LIST_HEAD(&prefs_delayed); @@ -917,7 +923,7 @@ again: } spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, - &prefs_delayed); + &prefs_delayed, &total_refs); mutex_unlock(&head->mutex); if (ret) goto out; @@ -938,7 +944,8 @@ again: (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, - &info_level, &prefs); + &info_level, &prefs, + &total_refs); if (ret) goto out; ret = __add_keyed_refs(fs_info, path, bytenr, @@ -958,7 +965,7 @@ again: __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos); + extent_item_pos, total_refs); if (ret) goto out; -- cgit v1.2.3 From 73b802f44747e824f6efe273903149ede9ddf741 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 21 Mar 2014 15:30:44 -0700 Subject: btrfs: fix uninit variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fs/btrfs/send.c:2926: warning: ‘entry’ may be used uninitialized in this function Signed-off-by: Chris Mason --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 143fed3f4586..9b6da9d55f9a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2923,7 +2923,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; - struct pending_dir_move *entry, *pm; + struct pending_dir_move *entry = NULL, *pm; struct recorded_ref *cur; int exists = 0; int ret; -- cgit v1.2.3 From 00fdf13a2e9f313a044288aa59d3b8ec29ff904a Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 10 Mar 2014 18:56:07 +0800 Subject: Btrfs: fix a crash of clone with inline extents's split xfstests's btrfs/035 triggers a BUG_ON, which we use to detect the split of inline extents in __btrfs_drop_extents(). For inline extents, we cannot duplicate another EXTENT_DATA item, because it breaks the rule of inline extents, that is, 'start offset' needs to be 0. 
We have set limitations for the source inode's compressed inline extents, because it needs to decompress and recompress. Now the destination inode's inline extents also need similar limitations. With this, xfstests btrfs/035 doesn't run into panic. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/file.c | 15 ++++++++++++--- fs/btrfs/ioctl.c | 10 ++++++---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b2143b8c33c5..036f506cabd8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -804,7 +804,10 @@ next_slot: */ if (start > key.offset && end < extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EINVAL; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = start; @@ -847,7 +850,10 @@ next_slot: * | -------- extent -------- | */ if (start <= key.offset && end < extent_end) { - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EINVAL; + break; + } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; @@ -870,7 +876,10 @@ next_slot: */ if (start > key.offset && end >= extent_end) { BUG_ON(del_nr > 0); - BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + ret = -EINVAL; + break; + } btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3ca313b138ca..6778fa3c6ed2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3087,8 +3087,9 @@ process_slot: new_key.offset + datal, 1); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + if (ret != -EINVAL) + btrfs_abort_transaction(trans, + root, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3246,8 +3247,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * decompress into destination's address_space (the file offset * may change, so source mapping won't do), then recompress (or * otherwise reinsert) a subrange. - * - allow ranges within the same file to be cloned (provided - * they don't overlap)? + * + * - split destination inode's inline extents. The inline extents can + * be either compressed or non-compressed. */ /* the destination must be opened for writing */ -- cgit v1.2.3